1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"
16 
17 #include <stddef.h>
18 
19 #include <cstdint>
20 #include <vector>
21 
22 // Only use multi-threaded Eigen if ruy is disabled.
23 #if !defined(TFLITE_WITH_RUY)
24 #define TFLITE_WITH_MULTITHREADED_EIGEN
25 #endif
26 
27 #include "tensorflow/lite/c/builtin_op_data.h"
28 #include "tensorflow/lite/c/common.h"
29 #include "tensorflow/lite/kernels/cpu_backend_context.h"
30 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
31 #include "tensorflow/lite/kernels/eigen_support.h"
32 #endif
33 #include "tensorflow/lite/kernels/internal/compatibility.h"
34 #include "tensorflow/lite/kernels/internal/types.h"
35 // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
36 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
37 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
38 #endif
39 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
40 #include "tensorflow/lite/kernels/internal/quantization_util.h"
41 #include "tensorflow/lite/kernels/internal/reference/conv.h"
42 #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
43 #include "tensorflow/lite/kernels/internal/tensor.h"
44 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
45 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
46 #include "tensorflow/lite/kernels/kernel_util.h"
47 #include "tensorflow/lite/kernels/padding.h"
48 #include "tensorflow/lite/util.h"
49 
50 namespace tflite {
51 namespace ops {
52 namespace builtin {
53 namespace conv {
54 
55 // This file has 4 implementations of Conv.
56 enum KernelType {
57   kReference,
58   kGenericOptimized,  // Neon-free
59   // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
60   // are available and kGenericOptimized when we must use only one thread.
61   kMultithreadOptimized,
62   // The kernel uses the CBLAS interface for matrix multiplication.
63   // It's fast when an optimized CBLAS implementation is available (e.g. the
64   // Apple Accelerate Framework), and slow when falling back to the naive
65   // implementation.
66   kCblasOptimized,
67 };
68 
69 const int kTensorNotAllocated = -1;
70 
71 static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024;  // 1GB
72 
73 struct OpData {
74   // IDs are the arbitrary identifiers used by TF Lite to identify and access
75   // memory buffers.
76   int im2col_id = kTensorNotAllocated;
77   int hwcn_weights_id = kTensorNotAllocated;
78   int input_quantized_id = kTensorNotAllocated;
79   int scaling_factors_id = kTensorNotAllocated;
80   int input_offset_id = kTensorNotAllocated;
81   int accum_scratch_id = kTensorNotAllocated;
82   // Row sums are used to cache filter sums for hybrid zero-point calculations.
83   int row_sums_id = kTensorNotAllocated;
84 
85   TfLitePaddingValues padding;
86   // The scaling factor from input to output (aka the 'real multiplier') can
87   // be represented as a fixed point multiplier plus a left shift.
88   int32_t output_multiplier;
89   int output_shift;
90 
91   // Per channel output multiplier and shift.
92   std::vector<int32_t> per_channel_output_multiplier;
93   std::vector<int> per_channel_output_shift;
94 
95   // The range of the fused activation layer. For example for kNone and
96   // uint8_t these would be 0 and 255.
97   int32_t output_activation_min;
98   int32_t output_activation_max;
99   // Indexes are offsets into the array used to keep track of the allocated
100   // temporary tensors.
101   int32_t im2col_index;
102   int32_t hwcn_weights_index;
103   int32_t input_quantized_index;
104   int32_t scaling_factors_index;
105   int32_t accum_scratch_index;
106   int32_t input_offset_index;
107   int32_t row_sums_index;
108 
109   bool need_hwcn_weights = false;
110   bool have_weights_been_transposed = false;
111   bool need_im2col = false;
112   // If true, im2col is needed but is disabled because the temporary im2col
113   // tensor would require too much memory (i.e.
114   // >= kMaxIm2colBufferSizeMobile).
115   bool im2col_oversized = false;
116 
117   bool supports_multithreaded_kernel = false;
118   bool is_hybrid_per_channel = false;
119   bool compute_hybrid_row_sums = true;
120 
121   // Number of convolution groups.
122   int32_t groups = 1;
123 };
124 
125 inline PaddingType RuntimePaddingType(TfLitePadding padding) {
126   switch (padding) {
127     case TfLitePadding::kTfLitePaddingSame:
128       return PaddingType::kSame;
129     case TfLitePadding::kTfLitePaddingValid:
130       return PaddingType::kValid;
131     case TfLitePadding::kTfLitePaddingUnknown:
132     default:
133       return PaddingType::kNone;
134   }
135 }
136 
137 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
138   // This is a builtin op, so we don't use the contents in 'buffer', if any.
139   // Instead, we allocate a new object to use as scratch space for im2col, and
140   // to carry information from Prepare() to Eval().
141   auto* data = new OpData;
142 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
143   eigen_support::IncrementUsageCounter(context);
144 #endif
145   return data;
146 }
147 
148 void Free(TfLiteContext* context, void* buffer) {
149 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
150   eigen_support::DecrementUsageCounter(context);
151 #endif
152   delete reinterpret_cast<OpData*>(buffer);
153 }
154 
155 // Naive implementation of transpose for floats. Could be optimized to be more
156 // cache friendly, but for now it's a one-time cost on first run, and we would
157 // prefer to remove the need to do this at all eventually.
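// Conceptually, this treats the OHWI filter as a [filter_count, h*w*input_depth]
// matrix and writes out the [h*w*input_depth, filter_count] layout that the
// multithreaded Eigen kernel expects (see need_hwcn_weights below).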
158 void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) {
159   const int rows = output->dims->data[1];
160   const int cols = output->dims->data[0];
161   const float* input_data = GetTensorData<float>(input);
162   float* output_data = GetTensorData<float>(output);
163   for (int i = 0; i < rows; ++i) {
164     for (int j = 0; j < cols; ++j) {
165       const float in_value = input_data[i * cols + j];
166       output_data[j * rows + i] = in_value;
167     }
168   }
169 }
170 
171 // Check if im2col needs to be allocated, as some versions of optimized Conv don't
172 // use it. If any change adds im2col support to any of the Conv versions, then
173 // this function should be updated as well.
174 bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params,
175                       const TfLiteTensor* filter, OpData* data, bool is_hybrid,
176                       KernelType kernel_type) {
177   // If HWCN weights are required, im2col is not required.
178   if (data->need_hwcn_weights) return false;
179 
180   // Segregate based on dilated vs. non-dilated convolution.
181   const bool need_dilated_im2col =
182       params->dilation_width_factor != 1 || params->dilation_height_factor != 1;
183   const bool need_non_dilated_im2col =
184       params->stride_width != 1 || params->stride_height != 1 ||
185       filter->dims->data[2] != 1 || filter->dims->data[1] != 1;
186 
187   const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col;
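  // E.g. a 1x1 filter with stride 1 and no dilation sets both flags to false,
  // so no im2col buffer is needed at all.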
188 
189   // Return early as the basic requirement is not met.
190   if (!need_im2col) return false;
191 
192   // Special case for hybrid, as it currently supports only non-dilated im2col.
193   const bool is_hybrid_non_dilated = is_hybrid && need_non_dilated_im2col;
194   const bool is_quantized = input->type == kTfLiteUInt8 ||
195                             input->type == kTfLiteInt8 ||
196                             input->type == kTfLiteInt16;
197 
198   switch (kernel_type) {
199     case kReference:
200       if (is_hybrid) {
201         return true;
202       } else {
203         return false;
204       }
205     case kGenericOptimized:
206     case kCblasOptimized:
207       if (is_hybrid && !need_non_dilated_im2col) {
208         return false;
209       } else {
210         return true;
211       }
212     case kMultithreadOptimized:
213       if (is_hybrid_non_dilated || is_quantized ||
214           !data->supports_multithreaded_kernel) {
215         return true;
216       } else {
217         return false;
218       }
219     default:
220       return false;
221   }
222 }
223 
224 // Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
225 // Note: `context->AddTensors` might invalidate pointers to existing tensors.
226 // Therefore the logic to add tensors is isolated into this function.
227 static TfLiteStatus AllocateTemporaryTensorsIfRequired(
228     TfLiteContext* context, TfLiteNode* node, bool is_hybrid,
229     bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) {
230   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
231   OpData* data = reinterpret_cast<OpData*>(node->user_data);
232 
233   TF_LITE_ENSURE(context, node->inputs->size >= 2);
234   const TfLiteTensor* input;
235   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
236   const TfLiteTensor* filter;
237   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
238 
239   // If we're using the optimized multithreaded EigenTensor implementation of
240   // convolution, it expects the filter weights to be transposed compared to
241   // the normal TF Lite buffer format. Typical TF Lite weights are
242   // [filter_count, filter_height, filter_width, input_depth], but for the float
243   // implementation we need them as [filter_height, filter_width, input_depth,
244   // filter_count]. We get to that format by transposing, and create a temporary
245   // buffer to store the results.
246   // This path is only used for float processing, so only create the buffer if
247   // we're running with that data type.
248   data->need_hwcn_weights =
249       input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel;
250 
251   // We don't always need to allocate im2col. It is only used in some versions
252   // of the optimized Conv. This test just mimics something that happens inside
253   // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
254   data->need_im2col =
255       IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type);
256 
257   // If im2col_oversized is found to be true, we have to fall back to an
258   // execution path (like kReference in float/quantized cases) that doesn't
259   // require the im2col operation. Therefore, we have to skip this check for the
260   // hybrid case (but not the hybrid-per-channel one), where there's no such
261   // fallback execution path.
262   // TODO(b/178743262): Consider making this check conditioned on the available
263   // memory of the system, rather than coupling to the mobile platform check.
264   if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) &&
265       data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) {
266     data->need_im2col = false;
267     data->im2col_oversized = true;
268   }
269   int temporaries_count = 0;
270   if (data->need_im2col) {
271     data->im2col_index = temporaries_count;
272     if (data->im2col_id == kTensorNotAllocated) {
273       context->AddTensors(context, 1, &data->im2col_id);
274     }
275     ++temporaries_count;
276   }
277   if (data->need_hwcn_weights) {
278     data->hwcn_weights_index = temporaries_count;
279     if (data->hwcn_weights_id == kTensorNotAllocated) {
280       context->AddTensors(context, 1, &data->hwcn_weights_id);
281     }
282     ++temporaries_count;
283   }
284 
285   if (is_hybrid) {
286     // Allocate tensor to store the on-the-fly quantized inputs.
287     data->input_quantized_index = temporaries_count;
288     if (data->input_quantized_id == kTensorNotAllocated) {
289       TF_LITE_ENSURE_OK(
290           context, context->AddTensors(context, 1, &data->input_quantized_id));
291     }
292     ++temporaries_count;
293 
294     // Allocate tensor to store the quantization params computed during
295     // on-the-fly input quantization.
296     data->scaling_factors_index = temporaries_count;
297     if (data->scaling_factors_id == kTensorNotAllocated) {
298       TF_LITE_ENSURE_OK(
299           context, context->AddTensors(context, 1, &data->scaling_factors_id));
300     }
301     ++temporaries_count;
302 
303     // Allocate tensor to store the accumulators for the matrix multiply.
304     data->accum_scratch_index = temporaries_count;
305     if (data->accum_scratch_id == kTensorNotAllocated) {
306       TF_LITE_ENSURE_OK(
307           context, context->AddTensors(context, 1, &data->accum_scratch_id));
308     }
309     ++temporaries_count;
310     if (is_per_channel) {
311       data->input_offset_index = temporaries_count;
312       if (data->input_offset_id == kTensorNotAllocated) {
313         TF_LITE_ENSURE_OK(
314             context, context->AddTensors(context, 1, &data->input_offset_id));
315       }
316       ++temporaries_count;
317 
318       data->row_sums_index = temporaries_count;
319       if (data->row_sums_id == kTensorNotAllocated) {
320         TF_LITE_ENSURE_OK(context,
321                           context->AddTensors(context, 1, &data->row_sums_id));
322       }
323       ++temporaries_count;
324     }
325   }
326 
327   TfLiteIntArrayFree(node->temporaries);
328   node->temporaries = TfLiteIntArrayCreate(temporaries_count);
329 
330   return kTfLiteOk;
331 }
332 
333 TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context,
334                      TfLiteNode* node) {
335   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
336   OpData* data = reinterpret_cast<OpData*>(node->user_data);
337 
338   bool has_bias = node->inputs->size == 3;
339   // Check number of inputs/outputs
340   TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
341   TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
342   TfLiteTensor* output;
343   TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
344   const TfLiteTensor* input;
345   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
346   const TfLiteTensor* filter;
347   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
348 
349   // Check dimensionality of input, filter
350   TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
351   TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
352   // Check input channels matching filter
353   // Filter input channel can be a factor of channels of input (grouped conv)
354   // or equals (normal conv).
355   auto input_channel = input->dims->data[3];
356   auto filter_input_channel = filter->dims->data[3];
357   TF_LITE_ENSURE_EQ(context, input_channel % filter_input_channel, 0);
358   data->groups = input_channel / filter_input_channel;
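  // For example (hypothetical shapes): input_channel == 8 with
  // filter_input_channel == 2 gives groups == 4 (grouped convolution), while
  // filter_input_channel == input_channel gives groups == 1 (normal convolution).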
359 
360   // Check types. (We assume that UINT8 refers to quantized tensors)
361   TfLiteType input_type = input->type;
362   TF_LITE_ENSURE(context,
363                  input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
364                      input_type == kTfLiteInt8 || input_type == kTfLiteInt16);
365   TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type);
366 
367   if (input_type == kTfLiteInt16) {
368     TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
369     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
370   }
371   // Filter zero-points must all be zero for per-channel quantization.
372   if (input_type == kTfLiteInt16 || input_type == kTfLiteInt8) {
373     TF_LITE_ENSURE_EQ(context, filter->quantization.type,
374                       kTfLiteAffineQuantization);
375     const auto* affine_quantization =
376         reinterpret_cast<TfLiteAffineQuantization*>(
377             filter->quantization.params);
378     for (int i = 0; i < affine_quantization->zero_point->size; ++i) {
379       TF_LITE_ENSURE_EQ(context, affine_quantization->zero_point->data[i], 0);
380     }
381   }
382 
383   const TfLiteTensor* bias = nullptr;
384 
385   // TODO(ahentz): At this point the optimized versions require 'bias'. We can
386   // either change that or document that convolution requires it.
387   TF_LITE_ENSURE(context, has_bias);
388 
389   if (has_bias) {
390     TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias));
391     if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
392       TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32);
393       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
394     } else if (input_type == kTfLiteInt16) {
395       TF_LITE_ENSURE(context, (bias->type == kTfLiteInt32) ||
396                                   (bias->type == kTfLiteInt64));
397       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
398     } else {
399       TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type);
400     }
401     TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
402   }
403 
404   const bool is_hybrid =
405       (input->type == kTfLiteFloat32 &&
406        (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));
407 
408   if (is_hybrid && filter->type == kTfLiteInt8 &&
409       filter->quantization.type == kTfLiteAffineQuantization &&
410       filter->quantization.params &&
411       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
412           ->scale &&
413       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
414               ->scale->size > 1) {
415     const auto* affine_quantization =
416         reinterpret_cast<TfLiteAffineQuantization*>(
417             filter->quantization.params);
418     const float scale = affine_quantization->scale->data[0];
419     for (int i = 1; i < affine_quantization->scale->size; i++) {
420       if (affine_quantization->scale->data[i] != scale) {
421         data->is_hybrid_per_channel = true;
422         break;
423       }
424     }
425   }
426 
427   // The multi-threaded kernel supports neither dilation nor hybrid kernels, and
428   // is incompatible with mutable input filters that might change between evals.
429   data->supports_multithreaded_kernel =
430       (kernel_type == kMultithreadOptimized) &&
431       (context->recommended_num_threads != 1) && !is_hybrid &&
432       (params->dilation_width_factor == 1) &&
433       (params->dilation_height_factor == 1) &&
434       (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter);
435 
436   int channels_in = filter->dims->data[3];
437   int channels_out = filter->dims->data[0];
438   int width = input->dims->data[2];
439   int height = input->dims->data[1];
440   int filter_width = filter->dims->data[2];
441   int filter_height = filter->dims->data[1];
442   int batches = input->dims->data[0];
443 
444   // Matching GetWindowedOutputSize in TensorFlow.
445   auto padding = params->padding;
446   int out_width, out_height;
447   data->padding = ComputePaddingHeightWidth(
448       params->stride_height, params->stride_width,
449       params->dilation_height_factor, params->dilation_width_factor, height,
450       width, filter_height, filter_width, padding, &out_height, &out_width);
451 
452   size_t im2col_type_size;
453   TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size));
454   // Note that we intentionally promote the first multiplicand (i.e. 'batches')
455   // to 'size_t' to avoid integer overflow here.
456   const size_t im2col_bytes = static_cast<size_t>(batches) * out_height *
457                               out_width * channels_in * filter_height *
458                               filter_width * im2col_type_size;
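  // For illustration (hypothetical shapes): batches=1, out_height=out_width=112,
  // channels_in=32 and a 3x3 float32 filter give 1*112*112*32*3*3*4 bytes,
  // roughly 14 MB, which is well under kMaxIm2colBufferSizeMobile.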
459   TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(
460       context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type,
461       im2col_bytes));
462 
463   TF_LITE_ENSURE(context, has_bias);
464 
465   // Note that full fixed-point inference requires that all tensors have their
466   // parameters set. This is usually done during quantized training or
467   // calibration.
468   if (input_type != kTfLiteFloat32) {
469     TF_LITE_ENSURE_EQ(context, filter->quantization.type,
470                       kTfLiteAffineQuantization);
471     const auto* affine_quantization =
472         reinterpret_cast<TfLiteAffineQuantization*>(
473             filter->quantization.params);
474     TF_LITE_ENSURE(context, affine_quantization);
475     TF_LITE_ENSURE(context, affine_quantization->scale);
476     TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 ||
477                              affine_quantization->scale->size == channels_out));
478 
479     data->per_channel_output_multiplier.resize(channels_out);
480     data->per_channel_output_shift.resize(channels_out);
481     TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
482         context, input, filter, bias, output, params->activation,
483         &data->output_multiplier, &data->output_shift,
484         &data->output_activation_min, &data->output_activation_max,
485         data->per_channel_output_multiplier.data(),
486         data->per_channel_output_shift.data(), channels_out));
487   }
488 
489   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
490   output_size->data[0] = batches;
491   output_size->data[1] = out_height;
492   output_size->data[2] = out_width;
493   output_size->data[3] = channels_out;
494   auto output_status = context->ResizeTensor(context, output, output_size);
495 
496   if (output_status != kTfLiteOk) return output_status;
497 
498   if (data->need_im2col) {
499     node->temporaries->data[data->im2col_index] = data->im2col_id;
500 
501     TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
502 
503     auto filter_input_channel = filter->dims->data[3];
504     im2col_size->data[0] = output_size->data[0];
505     im2col_size->data[1] = output_size->data[1];
506     im2col_size->data[2] = output_size->data[2];
507     im2col_size->data[3] = filter_input_channel * filter_height * filter_width;
508 
509     TfLiteTensor* im2col =
510         &context->tensors[node->temporaries->data[data->im2col_index]];
511     im2col->type = input->type;
512     if (is_hybrid) {
513       im2col->type = filter->type;
514     }
515     im2col->allocation_type = kTfLiteArenaRw;
516     auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
517     if (im2col_status != kTfLiteOk) return im2col_status;
518   }
519 
520   if (data->need_hwcn_weights) {
521     node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
522     TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
523 
524     // Because we're treating the filter weights as a matrix when we do the
525     // transpose, we allocate the buffer with a two-dimensional shape, where one
526     // dimension is the number of elements in each filter, and the second is the
527     // total number of filters.
528     auto filter_input_channel = filter->dims->data[3];
529     hwcn_weights_size->data[0] =
530         (filter_height * filter_width * filter_input_channel);
531     hwcn_weights_size->data[1] = channels_out;
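    // E.g. (hypothetical shapes): a [64, 3, 3, 32] OHWI filter is stored here
    // as a [288, 64] matrix (3*3*32 = 288 elements per filter, 64 filters).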
532 
533     TfLiteTensor* hwcn_weights =
534         &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
535     hwcn_weights->type = input_type;
536     hwcn_weights->name = "Conv_hwcn_weights";
537     hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;
538 
539     auto hwcn_weights_status =
540         context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
541     if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
542 
543     // TODO(petewarden): If Resize() is called when the size hasn't actually
544     // changed, this will do extra redundant work.
545     data->have_weights_been_transposed = false;
546   }
547 
548   if (is_hybrid) {
549     node->temporaries->data[data->input_quantized_index] =
550         data->input_quantized_id;
551     TfLiteTensor* input_quantized;
552     TF_LITE_ENSURE_OK(
553         context, GetTemporarySafe(context, node, data->input_quantized_index,
554                                   &input_quantized));
555     input_quantized->type = kTfLiteInt8;
556     input_quantized->allocation_type = kTfLiteArenaRw;
557     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
558       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
559       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
560                                                        input_quantized_size));
561     }
562 
563     node->temporaries->data[data->scaling_factors_index] =
564         data->scaling_factors_id;
565     TfLiteTensor* scaling_factors;
566     TF_LITE_ENSURE_OK(
567         context, GetTemporarySafe(context, node, data->scaling_factors_index,
568                                   &scaling_factors));
569     scaling_factors->type = kTfLiteFloat32;
570     scaling_factors->allocation_type = kTfLiteArenaRw;
571     // Only one scale factor per batch is typically necessary. See optimized
572     // implementation for why we need to allocate for the height of the inputs
573     // flattened to 2D.
574     TF_LITE_ENSURE(context, channels_in != 0);
575     const int height = NumElements(input) / channels_in;
576     int scaling_dims[1] = {height};
577     if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
578       TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
579       scaling_factors_size->data[0] = height;
580       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
581                                                        scaling_factors_size));
582     }
583 
584     node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id;
585     TfLiteTensor* accum_scratch;
586     TF_LITE_ENSURE_OK(context,
587                       GetTemporarySafe(context, node, data->accum_scratch_index,
588                                        &accum_scratch));
589     accum_scratch->type = kTfLiteInt32;
590     accum_scratch->allocation_type = kTfLiteArenaRw;
591     const int scratch_width = batches * out_height * out_width;
592     int accum_scratch_dims[2] = {channels_out, scratch_width};
593     if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2,
594                                    accum_scratch_dims)) {
595       TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2);
596       accum_scratch_size->data[0] = channels_out;
597       accum_scratch_size->data[1] = scratch_width;
598       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch,
599                                                        accum_scratch_size));
600     }
601 
602     if (data->is_hybrid_per_channel) {
603       const auto* affine_quantization =
604           reinterpret_cast<TfLiteAffineQuantization*>(
605               filter->quantization.params);
606       TF_LITE_ENSURE_EQ(
607           context, affine_quantization->scale->size,
608           filter->dims->data[affine_quantization->quantized_dimension]);
609       node->temporaries->data[data->input_offset_index] = data->input_offset_id;
610       TfLiteTensor* input_offsets;
611       TF_LITE_ENSURE_OK(
612           context, GetTemporarySafe(context, node, data->input_offset_index,
613                                     &input_offsets));
614       input_offsets->type = kTfLiteInt32;
615       input_offsets->allocation_type = kTfLiteArenaRw;
616       // See above comment for the need to allocate for height of inputs.
617       TF_LITE_ENSURE(context, channels_in != 0);
618       const int height = NumElements(input) / channels_in;
619       const int input_offset_dims[1] = {height};
620       if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1,
621                                      input_offset_dims)) {
622         TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1);
623         input_offsets_size->data[0] = input_offset_dims[0];
624         TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets,
625                                                          input_offsets_size));
626       }
627       node->temporaries->data[data->row_sums_index] = data->row_sums_id;
628       TfLiteTensor* row_sums;
629       TF_LITE_ENSURE_OK(
630           context,
631           GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
632       row_sums->type = kTfLiteInt32;
633       row_sums->name = "Conv_row_sums";
634       row_sums->allocation_type = kTfLiteArenaRwPersistent;
635       // One cached filter row sum per output channel (see row_sums_id above).
636       const int row_sums_dims[1] = {channels_out};
637       if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) {
638         TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1);
639         row_sums_size->data[0] = row_sums_dims[0];
640         TF_LITE_ENSURE_OK(
641             context, context->ResizeTensor(context, row_sums, row_sums_size));
642       }
643     }
644   }
645   return kTfLiteOk;
646 }
647 
648 template <KernelType kernel_type>
649 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
650   return Prepare(kernel_type, context, node);
651 }
652 
653 template <KernelType kernel_type>
654 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
655                    TfLiteConvParams* params, OpData* data,
656                    const TfLiteTensor* input, const TfLiteTensor* filter,
657                    const TfLiteTensor* bias, TfLiteTensor* im2col,
658                    TfLiteTensor* output) {
659   auto input_offset = -input->params.zero_point;
660   auto filter_offset = -filter->params.zero_point;
661   auto output_offset = output->params.zero_point;
662 
663   KernelType effective_kernel_type;
664   if ((kernel_type == kMultithreadOptimized ||
665        kernel_type == kCblasOptimized) &&
666       (params->dilation_width_factor != 1 ||
667        params->dilation_height_factor != 1)) {
668     // kMultithreadOptimized and kCblasOptimized do not support dilation.
669     // Therefore, fall back to kGenericOptimized.
670     effective_kernel_type = kGenericOptimized;
671   } else {
672     effective_kernel_type = kernel_type;
673   }
674 
675   // We have to fall back to the reference execution path when im2col is needed
676   // but disabled because the to-be-allocated temporary im2col tensor is too
677   // large. See b/178743262 for the detailed motivation.
678   if (data->im2col_oversized) {
679     effective_kernel_type = kReference;
680   }
681 
682   // Grouped convolution is currently only supported by the reference kernel.
683   if (data->groups != 1) {
684     effective_kernel_type = kReference;
685   }
686 
687   ConvParams op_params;
688   op_params.padding_type = PaddingType::kSame;
689   op_params.padding_values.width = data->padding.width;
690   op_params.padding_values.height = data->padding.height;
691   op_params.dilation_width_factor = params->dilation_width_factor;
692   op_params.dilation_height_factor = params->dilation_height_factor;
693   op_params.stride_width = params->stride_width;
694   op_params.stride_height = params->stride_height;
695   op_params.input_offset = input_offset;
696   op_params.weights_offset = filter_offset;
697   op_params.output_offset = output_offset;
698   op_params.output_multiplier = data->output_multiplier;
699   op_params.output_shift = -data->output_shift;
700   op_params.quantized_activation_min = data->output_activation_min;
701   op_params.quantized_activation_max = data->output_activation_max;
702   switch (effective_kernel_type) {
703     case kReference: {
704       reference_ops::Conv(
705           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
706           GetTensorShape(filter), GetTensorData<uint8_t>(filter),
707           GetTensorShape(bias), GetTensorData<int32_t>(bias),
708           GetTensorShape(output), GetTensorData<uint8_t>(output),
709           GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
710           /* cpu_backend_context = */ nullptr);
711       break;
712     }
713     case kGenericOptimized:
714     case kMultithreadOptimized:
715     case kCblasOptimized: {
716       // There is only one optimized implementation for Quantized Conv.
717       optimized_ops::Conv(
718           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
719           GetTensorShape(filter), GetTensorData<uint8_t>(filter),
720           GetTensorShape(bias), GetTensorData<int32_t>(bias),
721           GetTensorShape(output), GetTensorData<uint8_t>(output),
722           GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
723           CpuBackendContext::GetFromContext(context));
724       break;
725     }
726   }
727 }
728 
729 template <KernelType kernel_type>
730 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
731                              TfLiteConvParams* params, OpData* data,
732                              const TfLiteTensor* input,
733                              const TfLiteTensor* filter,
734                              const TfLiteTensor* bias, TfLiteTensor* output,
735                              TfLiteTensor* im2col) {
736   ConvParams op_params;
737   op_params.input_offset = -input->params.zero_point;
738   op_params.output_offset = output->params.zero_point;
739   op_params.stride_height = params->stride_height;
740   op_params.stride_width = params->stride_width;
741   op_params.dilation_height_factor = params->dilation_height_factor;
742   op_params.dilation_width_factor = params->dilation_width_factor;
743   op_params.padding_values.height = data->padding.height;
744   op_params.padding_values.width = data->padding.width;
745   op_params.quantized_activation_min = data->output_activation_min;
746   op_params.quantized_activation_max = data->output_activation_max;
747 
748   KernelType effective_kernel_type = kernel_type;
749   // We have to fall back to the reference execution path when im2col is needed
750   // but disabled because the to-be-allocated temporary im2col tensor is too
751   // large. See b/178743262 for the detailed motivation.
752   if (data->im2col_oversized) {
753     effective_kernel_type = kReference;
754   }
755 
756   // Grouped convolution is currently only supported by the reference kernel.
757   if (data->groups != 1) {
758     effective_kernel_type = kReference;
759   }
760 
761   switch (effective_kernel_type) {
762     case kReference: {
763       reference_integer_ops::ConvPerChannel(
764           op_params, data->per_channel_output_multiplier.data(),
765           data->per_channel_output_shift.data(), GetTensorShape(input),
766           GetTensorData<int8>(input), GetTensorShape(filter),
767           GetTensorData<int8>(filter), GetTensorShape(bias),
768           GetTensorData<int32>(bias), GetTensorShape(output),
769           GetTensorData<int8>(output));
770       break;
771     }
772     case kGenericOptimized:
773     case kMultithreadOptimized:
774     case kCblasOptimized: {
775       optimized_integer_ops::ConvPerChannel(
776           op_params, data->per_channel_output_multiplier.data(),
777           data->per_channel_output_shift.data(), GetTensorShape(input),
778           GetTensorData<int8>(input), GetTensorShape(filter),
779           GetTensorData<int8>(filter), GetTensorShape(bias),
780           GetTensorData<int32>(bias), GetTensorShape(output),
781           GetTensorData<int8>(output), GetTensorShape(im2col),
782           GetTensorData<int8>(im2col),
783           CpuBackendContext::GetFromContext(context));
784       break;
785     }
786   }
787 }
788 
789 template <KernelType kernel_type>
790 void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node,
791                                  TfLiteConvParams* params, OpData* data,
792                                  const TfLiteTensor* input,
793                                  const TfLiteTensor* filter,
794                                  const TfLiteTensor* bias, TfLiteTensor* output,
795                                  TfLiteTensor* im2col) {
796   ConvParams op_params;
797   op_params.input_offset = -input->params.zero_point;
798   op_params.output_offset = output->params.zero_point;
799   op_params.stride_height = params->stride_height;
800   op_params.stride_width = params->stride_width;
801   op_params.dilation_height_factor = params->dilation_height_factor;
802   op_params.dilation_width_factor = params->dilation_width_factor;
803   op_params.padding_values.height = data->padding.height;
804   op_params.padding_values.width = data->padding.width;
805   op_params.quantized_activation_min = data->output_activation_min;
806   op_params.quantized_activation_max = data->output_activation_max;
807 
808   KernelType effective_kernel_type = kernel_type;
809   // We have to fall back to the reference execution path when im2col is needed
810   // but disabled because the to-be-allocated temporary im2col tensor is too
811   // large. See b/178743262 for the detailed motivation.
812   if (data->im2col_oversized) {
813     effective_kernel_type = kReference;
814   }
815 
816   // Grouped convolution is currently only supported by the reference kernel.
817   if (data->groups != 1) {
818     effective_kernel_type = kReference;
819   }
820 
821   // To prevent 32-bit accumulator overflow for 16x8 quantization, the
822   // optimized path is enabled only when all zero_points are 0.
823   bool has_non_zero_point = input->params.zero_point ||
824                             filter->params.zero_point ||
825                             output->params.zero_point;
826 
827   // Fall back to the reference kernel when the bias type is int64, as
828   // there is no optimized kernel for int64 bias yet.
829   if (bias && bias->type == kTfLiteInt64) {
830     reference_integer_ops::ConvPerChannel(
831         op_params, data->per_channel_output_multiplier.data(),
832         data->per_channel_output_shift.data(), GetTensorShape(input),
833         GetTensorData<int16>(input), GetTensorShape(filter),
834         GetTensorData<int8>(filter), GetTensorShape(bias),
835         GetTensorData<std::int64_t>(bias), GetTensorShape(output),
836         GetTensorData<int16>(output));
837   } else if (effective_kernel_type == kReference || has_non_zero_point) {
838     reference_integer_ops::ConvPerChannel(
839         op_params, data->per_channel_output_multiplier.data(),
840         data->per_channel_output_shift.data(), GetTensorShape(input),
841         GetTensorData<int16>(input), GetTensorShape(filter),
842         GetTensorData<int8>(filter), GetTensorShape(bias),
843         GetTensorData<std::int32_t>(bias), GetTensorShape(output),
844         GetTensorData<int16>(output));
845   } else {
846     optimized_integer_ops::ConvPerChannel(
847         op_params, data->per_channel_output_multiplier.data(),
848         data->per_channel_output_shift.data(), GetTensorShape(input),
849         GetTensorData<int16_t>(input), GetTensorShape(filter),
850         GetTensorData<int8_t>(filter), GetTensorShape(bias),
851         GetTensorData<std::int32_t>(bias), GetTensorShape(output),
852         GetTensorData<int16_t>(output), GetTensorShape(im2col),
853         GetTensorData<int16_t>(im2col),
854         CpuBackendContext::GetFromContext(context));
855   }
856 }
857 
858 template <KernelType kernel_type>
859 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
860                TfLiteConvParams* params, OpData* data,
861                const TfLiteTensor* input, const TfLiteTensor* filter,
862                const TfLiteTensor* bias, TfLiteTensor* im2col,
863                TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
864   float output_activation_min, output_activation_max;
865   CalculateActivationRange(params->activation, &output_activation_min,
866                            &output_activation_max);
867   KernelType effective_kernel_type = kernel_type;
868   // Fall back to the optimized path if multi-threaded conv is unsupported.
869   if ((kernel_type == kMultithreadOptimized) &&
870       !data->supports_multithreaded_kernel) {
871     effective_kernel_type = kGenericOptimized;
872   }
873 
874   // When im2col is needed (which is implied when 'im2col_oversized' is true),
875   // the GEMM-based optimized path requires the im2col data to be allocated to
876   // ensure correctness. Therefore, when im2col is disabled because of the
877   // oversized temporary im2col tensor, a fallback to a non-optimized path is
878   // needed.
879   // See b/178743262 for the detailed motivation.
880   if (data->im2col_oversized) {
881     effective_kernel_type = kReference;
882 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
883     // As detailed by the tflite::multithreaded_ops::Conv implementation in
884     // multithreaded_conv.h, the Eigen-based execution doesn't need im2col data.
885     // Therefore, we could rely on it as a better-optimized fallback than the
886     // reference one.
887     if (data->supports_multithreaded_kernel) {
888       effective_kernel_type = kMultithreadOptimized;
889     }
890 #endif
891   }
892 
893   // Grouped convolution is currently only supported by the reference kernel.
894   if (data->groups != 1) {
895     effective_kernel_type = kReference;
896   }
897 
898   ConvParams op_params;
899   op_params.padding_type = RuntimePaddingType(params->padding);
900   op_params.padding_values.width = data->padding.width;
901   op_params.padding_values.height = data->padding.height;
902   op_params.stride_width = params->stride_width;
903   op_params.stride_height = params->stride_height;
904   op_params.dilation_width_factor = params->dilation_width_factor;
905   op_params.dilation_height_factor = params->dilation_height_factor;
906   op_params.float_activation_min = output_activation_min;
907   op_params.float_activation_max = output_activation_max;
908   switch (effective_kernel_type) {
909     case kReference: {
910       reference_ops::Conv(op_params, GetTensorShape(input),
911                           GetTensorData<float>(input), GetTensorShape(filter),
912                           GetTensorData<float>(filter), GetTensorShape(bias),
913                           GetTensorData<float>(bias), GetTensorShape(output),
914                           GetTensorData<float>(output), GetTensorShape(im2col),
915                           GetTensorData<float>(im2col));
916       break;
917     }
918     case kCblasOptimized:
919     case kGenericOptimized: {
920       optimized_ops::Conv(op_params, GetTensorShape(input),
921                           GetTensorData<float>(input), GetTensorShape(filter),
922                           GetTensorData<float>(filter), GetTensorShape(bias),
923                           GetTensorData<float>(bias), GetTensorShape(output),
924                           GetTensorData<float>(output), GetTensorShape(im2col),
925                           GetTensorData<float>(im2col),
926                           CpuBackendContext::GetFromContext(context));
927       break;
928     }
929     case kMultithreadOptimized: {
930 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
931       const float* filter_data;
932       if (data->need_hwcn_weights) {
933         filter_data = GetTensorData<float>(hwcn_weights);
934       } else {
935         filter_data = GetTensorData<float>(filter);
936       }
937       multithreaded_ops::Conv(
938           *eigen_support::GetThreadPoolDevice(context), op_params,
939           GetTensorShape(input), GetTensorData<float>(input),
940           GetTensorShape(filter), filter_data, GetTensorShape(bias),
941           GetTensorData<float>(bias), GetTensorShape(output),
942           GetTensorData<float>(output), GetTensorShape(im2col),
943           GetTensorData<float>(im2col));
944       break;
945 #else   // !defined(TFLITE_WITH_MULTITHREADED_EIGEN)
946       // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
947       // is enabled. We #if out this code in order to get the corresponding
948       // binary size benefits.
949       TFLITE_DCHECK(false);
950 #endif  // defined(TFLITE_WITH_MULTITHREADED_EIGEN)
951     }
952   }
953 }
954 
955 template <KernelType kernel_type>
956 TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
957                                   TfLiteConvParams* params, OpData* data,
958                                   const TfLiteTensor* input,
959                                   const TfLiteTensor* filter,
960                                   const TfLiteTensor* bias,
961                                   TfLiteTensor* im2col, TfLiteTensor* output) {
962   float output_activation_min, output_activation_max;
963   CalculateActivationRange(params->activation, &output_activation_min,
964                            &output_activation_max);
965 
966   const int batch_size = SizeOfDimension(input, 0);
967   TF_LITE_ENSURE(context, batch_size != 0);
968   const int input_size = NumElements(input) / batch_size;
969   TfLiteTensor* quantized_input_tensor;
970   TF_LITE_ENSURE_OK(context,
971                     GetTemporarySafe(context, node, data->input_quantized_index,
972                                      &quantized_input_tensor));
973   int8_t* quantized_input_ptr_batch =
974       GetTensorData<int8_t>(quantized_input_tensor);
975   TfLiteTensor* scaling_factors_tensor;
976   TF_LITE_ENSURE_OK(context,
977                     GetTemporarySafe(context, node, data->scaling_factors_index,
978                                      &scaling_factors_tensor));
979   float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
980   TfLiteTensor* input_offset_tensor;
981   TF_LITE_ENSURE_OK(context,
982                     GetTemporarySafe(context, node, data->input_offset_index,
983                                      &input_offset_tensor));
984   int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor);
985 
986   for (int b = 0; b < batch_size; ++b) {
987     const int offset = b * input_size;
988     tensor_utils::AsymmetricQuantizeFloats(
989         GetTensorData<float>(input) + offset, input_size,
990         quantized_input_ptr_batch + offset, &scaling_factors_ptr[b],
991         &input_offset_ptr[b]);
992   }
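  // The per-batch scale and offset computed above are later used (together with
  // the cached filter row sums in the optimized path) to correct for the
  // asymmetric input zero-point.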
993 
994   int8_t* im2col_ptr = nullptr;
995   int8_t* filter_ptr = nullptr;
996   if (im2col != nullptr) {
997     im2col_ptr = im2col->data.int8;
998   }
999   filter_ptr = filter->data.int8;
1000   const auto* affine_quantization =
1001       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
1002 
1003   KernelType effective_kernel_type = kernel_type;
1004   // We have to fall back to the reference execution path when im2col is needed
1005   // but disabled because the to-be-allocated temporary im2col tensor is too
1006   // large. See b/178743262 for the detailed motivation.
1007   if (data->im2col_oversized) {
1008     effective_kernel_type = kReference;
1009   }
1010 
1011   // Grouped convolution is currently only supported by the reference kernel.
1012   if (data->groups != 1) {
1013     effective_kernel_type = kReference;
1014   }
1015 
1016   ConvParams op_params;
1017   op_params.padding_type = PaddingType::kSame;
1018   op_params.padding_values.width = data->padding.width;
1019   op_params.padding_values.height = data->padding.height;
1020   op_params.dilation_width_factor = params->dilation_width_factor;
1021   op_params.dilation_height_factor = params->dilation_height_factor;
1022   op_params.stride_width = params->stride_width;
1023   op_params.stride_height = params->stride_height;
1024   op_params.float_activation_min = output_activation_min;
1025   op_params.float_activation_max = output_activation_max;
1026   switch (effective_kernel_type) {
1027     case kReference:
1028       reference_ops::HybridConvPerChannel(
1029           op_params, scaling_factors_ptr, GetTensorShape(input),
1030           quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
1031           GetTensorShape(bias), GetTensorData<float>(bias),
1032           GetTensorShape(output), GetTensorData<float>(output),
1033           GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
1034           input_offset_ptr);
1035       break;
1036     case kGenericOptimized:
1037     case kMultithreadOptimized:
1038     case kCblasOptimized: {
1039       TfLiteTensor* row_sums;
1040       TF_LITE_ENSURE_OK(
1041           context,
1042           GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
1043       TfLiteTensor* scratch;
1044       TF_LITE_ENSURE_OK(
1045           context,
1046           GetTemporarySafe(context, node, data->accum_scratch_index, &scratch));
1047       optimized_ops::HybridConvPerChannel(
1048           op_params, scaling_factors_ptr, GetTensorShape(input),
1049           quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
1050           GetTensorShape(bias), GetTensorData<float>(bias),
1051           GetTensorShape(output), GetTensorData<float>(output),
1052           GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
1053           input_offset_ptr, GetTensorShape(scratch),
1054           GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums),
1055           &data->compute_hybrid_row_sums,
1056           CpuBackendContext::GetFromContext(context));
1057       data->compute_hybrid_row_sums = false;
1058       break;
1059     }
1060   }
1061 
1062   return kTfLiteOk;
1063 }
1064 
1065 template <KernelType kernel_type>
1066 TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
1067                         TfLiteConvParams* params, OpData* data,
1068                         const TfLiteTensor* input, const TfLiteTensor* filter,
1069                         const TfLiteTensor* bias, TfLiteTensor* im2col,
1070                         TfLiteTensor* accum_scratch, TfLiteTensor* output) {
1071   float output_activation_min, output_activation_max;
1072   CalculateActivationRange(params->activation, &output_activation_min,
1073                            &output_activation_max);
1074 
1075   const int batch_size = SizeOfDimension(input, 0);
1076   TF_LITE_ENSURE(context, batch_size != 0);
1077   const int input_size = NumElements(input) / batch_size;
1078 
1079   const float* input_ptr = GetTensorData<float>(input);
1080   TfLiteTensor* quantized_input_tensor;
1081   TF_LITE_ENSURE_OK(context,
1082                     GetTemporarySafe(context, node, data->input_quantized_index,
1083                                      &quantized_input_tensor));
1084   int8_t* quantized_input_ptr_batch =
1085       GetTensorData<int8_t>(quantized_input_tensor);
1086   TfLiteTensor* scaling_factors_tensor;
1087   TF_LITE_ENSURE_OK(context,
1088                     GetTemporarySafe(context, node, data->scaling_factors_index,
1089                                      &scaling_factors_tensor));
1090   float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
1091 
1092   // Per-batch input quantization for higher accuracy.
1093   {
1094     ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
1095     for (int b = 0; b < batch_size; ++b) {
1096       float unused_min, unused_max;
1097       const int offset = b * input_size;
1098       tensor_utils::SymmetricQuantizeFloats(
1099           input_ptr + offset, input_size, quantized_input_ptr_batch + offset,
1100           &unused_min, &unused_max, &scaling_factors_ptr[b]);
1101       scaling_factors_ptr[b] *= filter->params.scale;
1102     }
1103   }
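  // Note: folding filter->params.scale into scaling_factors_ptr[b] above lets
  // the int8 accumulators be dequantized with a single per-row multiply
  // downstream.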
1104 
1105   switch (kernel_type) {
1106     case kReference:
1107     case kGenericOptimized:
1108     case kMultithreadOptimized:
1109     case kCblasOptimized: {
1110       // There is only one implementation for the hybrid kernel.
1111       ConvParams op_params;
1112       op_params.padding_type = PaddingType::kSame;
1113       op_params.padding_values.width = data->padding.width;
1114       op_params.padding_values.height = data->padding.height;
1115       op_params.stride_width = params->stride_width;
1116       op_params.stride_height = params->stride_height;
1117       op_params.dilation_width_factor = params->dilation_width_factor;
1118       op_params.dilation_height_factor = params->dilation_height_factor;
1119       op_params.float_activation_min = output_activation_min;
1120       op_params.float_activation_max = output_activation_max;
1121       if (data->groups == 1) {
1122         optimized_ops::HybridConv(
1123             op_params, scaling_factors_ptr, GetTensorShape(input),
1124             quantized_input_ptr_batch, GetTensorShape(filter),
1125             GetTensorData<int8_t>(filter), GetTensorShape(bias),
1126             GetTensorData<float>(bias), GetTensorShape(accum_scratch),
1127             GetTensorData<int32_t>(accum_scratch), GetTensorShape(output),
1128             GetTensorData<float>(output), GetTensorShape(im2col),
1129             GetTensorData<int8_t>(im2col),
1130             CpuBackendContext::GetFromContext(context));
1131       } else {
1132         // This case is handled by (falls back to) the per-channel hybrid group
1133         // conv and shouldn't hit this branch.
1134         TF_LITE_KERNEL_LOG(
1135             context,
1136             "Group convolution currently not supported for hybrid kernel.");
1137         return kTfLiteError;
1138       }
1139       break;
1140     }
1141   }
1142 
1143   return kTfLiteOk;
1144 }
1145 
1146 template <KernelType kernel_type, TfLiteType input_type>
1147 TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
1148   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
1149   OpData* data = reinterpret_cast<OpData*>(node->user_data);
1150 
1151   TfLiteTensor* output;
1152   TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
1153   const TfLiteTensor* input;
1154   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1155   const TfLiteTensor* filter;
1156   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
1157   bool has_bias = node->inputs->size == 3;
1158   const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr;
1159   TfLiteTensor* im2col =
1160       data->need_im2col
1161           ? &context->tensors[node->temporaries->data[data->im2col_index]]
1162           : nullptr;
1163   TfLiteTensor* hwcn_weights =
1164       data->need_hwcn_weights
1165           ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
1166           : nullptr;
1167 
1168   if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
1169     TransposeFloatTensor(filter, hwcn_weights);
1170     data->have_weights_been_transposed = true;
1171   }
1172 
1173   TFLITE_DCHECK_EQ(input_type, input->type);
1174   switch (input_type) {  // Already know in/out types are the same.
1175     case kTfLiteFloat32:
1176       if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
1177         if (data->is_hybrid_per_channel ||
1178             // TODO(b/162870360): Fall back to the PerChannel implementation
1179             // until we have grouped hybrid convolution.
1180             data->groups != 1) {
1181           TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>(
1182                                          context, node, params, data, input,
1183                                          filter, bias, im2col, output));
1184         } else {
1185           TfLiteTensor* accum_scratch =
1186               &context->tensors[node->temporaries
1187                                     ->data[data->accum_scratch_index]];
1188           TF_LITE_ENSURE_OK(context,
1189                             EvalHybrid<kernel_type>(context, node, params, data,
1190                                                     input, filter, bias, im2col,
1191                                                     accum_scratch, output));
1192         }
1193       } else {
1194         EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
1195                                im2col, hwcn_weights, output);
1196       }
1197       break;
1198     case kTfLiteUInt8:
1199       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
1200                                  bias, im2col, output);
1201       break;
1202     case kTfLiteInt8:
1203       EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
1204                                            filter, bias, output, im2col);
1205       break;
1206     case kTfLiteInt16:
1207       EvalQuantizedPerChannel16x8<kernel_type>(
1208           context, node, params, data, input, filter, bias, output, im2col);
1209       break;
1210     default:
1211       TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.",
1212                          TfLiteTypeGetName(input->type));
1213       return kTfLiteError;
1214   }
1215   return kTfLiteOk;
1216 }
1217 
1218 template <KernelType kernel_type>
1219 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
1220   const TfLiteTensor* input;
1221   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1222 
1223   switch (input->type) {
1224     case kTfLiteFloat32:
1225       return EvalImpl<kernel_type, kTfLiteFloat32>(context, node);
1226     case kTfLiteUInt8:
1227       return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
1228     case kTfLiteInt8:
1229       return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
1230     case kTfLiteInt16:
1231       return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
1232     default:
1233       TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
1234                          TfLiteTypeGetName(input->type));
1235       return kTfLiteError;
1236   }
1237 }
1238 
1239 }  // namespace conv
1240 
1241 TfLiteRegistration* Register_CONVOLUTION_REF() {
1242   static TfLiteRegistration r = {conv::Init, conv::Free,
1243                                  conv::Prepare<conv::kReference>,
1244                                  conv::Eval<conv::kReference>};
1245   return &r;
1246 }
1247 
1248 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
1249   static TfLiteRegistration r = {conv::Init, conv::Free,
1250                                  conv::Prepare<conv::kGenericOptimized>,
1251                                  conv::Eval<conv::kGenericOptimized>};
1252   return &r;
1253 }
1254 
1255 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() {
1256   static TfLiteRegistration r = {
1257       conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>,
1258       conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>};
1259   return &r;
1260 }
1261 
1262 TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
1263   static TfLiteRegistration r = {conv::Init, conv::Free,
1264                                  conv::Prepare<conv::kMultithreadOptimized>,
1265                                  conv::Eval<conv::kMultithreadOptimized>};
1266   return &r;
1267 }
1268 
1269 TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
1270   static TfLiteRegistration r = {conv::Init, conv::Free,
1271                                  conv::Prepare<conv::kCblasOptimized>,
1272                                  conv::Eval<conv::kCblasOptimized>};
1273   return &r;
1274 }
1275 
1276 TfLiteRegistration* Register_CONV_2D() {
1277 #if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
1278   return Register_CONVOLUTION_CBLAS_OPT();
1279 #elif defined TFLITE_WITH_MULTITHREADED_EIGEN
1280   return Register_CONVOLUTION_MULTITHREADED_OPT();
1281 #else
1282   return Register_CONVOLUTION_GENERIC_OPT();
1283 #endif
1284 }
1285 
1286 // Warning: Clients using this variant are responsible for ensuring that their
1287 // models only need the UINT8 type. TFLite's op registration mechanism doesn't
1288 // yet allow for more nuanced registration.
1289 TfLiteRegistration* Register_CONV_2D_UINT8() {
1290 #if defined TFLITE_WITH_RUY
1291   // TFLITE_WITH_RUY optimizes the generic kernel type.
1292   return Register_CONVOLUTION_GENERIC_OPT_UINT8();
1293 #else
1294   return Register_CONV_2D();
1295 #endif
1296 }
1297 
1298 }  // namespace builtin
1299 }  // namespace ops
1300 }  // namespace tflite
1301