1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"
16
17 #include <stddef.h>
18
19 #include <cstdint>
20 #include <vector>
21
22 // Only use multi-threaded Eigen if ruy is disabled.
23 #if !defined(TFLITE_WITH_RUY)
24 #define TFLITE_WITH_MULTITHREADED_EIGEN
25 #endif
26
27 #include "tensorflow/lite/c/builtin_op_data.h"
28 #include "tensorflow/lite/c/common.h"
29 #include "tensorflow/lite/kernels/cpu_backend_context.h"
30 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
31 #include "tensorflow/lite/kernels/eigen_support.h"
32 #endif
33 #include "tensorflow/lite/kernels/internal/compatibility.h"
34 #include "tensorflow/lite/kernels/internal/types.h"
35 // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
36 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
37 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
38 #endif
39 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
40 #include "tensorflow/lite/kernels/internal/quantization_util.h"
41 #include "tensorflow/lite/kernels/internal/reference/conv.h"
42 #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
43 #include "tensorflow/lite/kernels/internal/tensor.h"
44 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
45 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
46 #include "tensorflow/lite/kernels/kernel_util.h"
47 #include "tensorflow/lite/kernels/padding.h"
48 #include "tensorflow/lite/util.h"
49
50 namespace tflite {
51 namespace ops {
52 namespace builtin {
53 namespace conv {
54
55 // This file has 4 implementations of Conv.
56 enum KernelType {
57 kReference,
58 kGenericOptimized, // Neon-free
59 // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
60 // are available and kGenericOptimized when we must use only one thread.
61 kMultithreadOptimized,
62 // The kernel uses the CBLAS interface for matrix multiplication.
63 // It's fast when an optimized CBLAS implementation is available (e.g. Apple
64 // Accelerate Framework), and it's slow when falling back to the naive
65 // implementation.
66 kCblasOptimized,
67 };
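// Note: which of these kernels is actually used is decided at build time in
// Register_CONV_2D() at the bottom of this file, based on the
// TFLITE_USE_APPLE_ACCELERATE_FOR_CONV and TFLITE_WITH_MULTITHREADED_EIGEN
// defines.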
68
69 const int kTensorNotAllocated = -1;
70
71 static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024; // 1GB
72
73 struct OpData {
74 // IDs are the arbitrary identifiers used by TF Lite to identify and access
75 // memory buffers.
76 int im2col_id = kTensorNotAllocated;
77 int hwcn_weights_id = kTensorNotAllocated;
78 int input_quantized_id = kTensorNotAllocated;
79 int scaling_factors_id = kTensorNotAllocated;
80 int input_offset_id = kTensorNotAllocated;
81 int accum_scratch_id = kTensorNotAllocated;
82 // Row sums are used to cache filter sums for hybrid zero-point calculations.
83 int row_sums_id = kTensorNotAllocated;
84
85 TfLitePaddingValues padding;
86 // The scaling factor from input to output (aka the 'real multiplier') can
87 // be represented as a fixed point multiplier plus a left shift.
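// Illustrative note: the real multiplier here is
//   real_multiplier = input_scale * filter_scale / output_scale,
// which is encoded (e.g. via QuantizeMultiplier) as a 32-bit fixed-point
// significand ('output_multiplier') together with a power-of-two exponent
// ('output_shift').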
88 int32_t output_multiplier;
89 int output_shift;
90
91 // Per channel output multiplier and shift.
92 std::vector<int32_t> per_channel_output_multiplier;
93 std::vector<int> per_channel_output_shift;
94
95 // The range of the fused activation layer. For example for kNone and
96 // uint8_t these would be 0 and 255.
97 int32_t output_activation_min;
98 int32_t output_activation_max;
99 // Indexes are the offsets into the node's temporaries array used to keep
100 // track of the allocated temporary tensors.
101 int32_t im2col_index;
102 int32_t hwcn_weights_index;
103 int32_t input_quantized_index;
104 int32_t scaling_factors_index;
105 int32_t accum_scratch_index;
106 int32_t input_offset_index;
107 int32_t row_sums_index;
108
109 bool need_hwcn_weights = false;
110 bool have_weights_been_transposed = false;
111 bool need_im2col = false;
112 // If it's true, it means im2col is needed but gets disabled because the
113 // temporary im2col tensor requires too much memory (i.e.
114 // >= kMaxIm2colBufferSizeMobile).
115 bool im2col_oversized = false;
116
117 bool supports_multithreaded_kernel = false;
118 bool is_hybrid_per_channel = false;
119 bool compute_hybrid_row_sums = true;
120
121 // Number of convolution groups.
122 int32_t groups = 1;
123 };
124
125 inline PaddingType RuntimePaddingType(TfLitePadding padding) {
126 switch (padding) {
127 case TfLitePadding::kTfLitePaddingSame:
128 return PaddingType::kSame;
129 case TfLitePadding::kTfLitePaddingValid:
130 return PaddingType::kValid;
131 case TfLitePadding::kTfLitePaddingUnknown:
132 default:
133 return PaddingType::kNone;
134 }
135 }
136
137 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
138 // This is a builtin op, so we don't use the contents in 'buffer', if any.
139 // Instead, we allocate a new object to use as scratch space for im2col, and
140 // to carry information from Prepare() to Eval().
141 auto* data = new OpData;
142 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
143 eigen_support::IncrementUsageCounter(context);
144 #endif
145 return data;
146 }
147
148 void Free(TfLiteContext* context, void* buffer) {
149 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
150 eigen_support::DecrementUsageCounter(context);
151 #endif
152 delete reinterpret_cast<OpData*>(buffer);
153 }
154
155 // Naive implementation of transpose for floats. Could be optimized to be more
156 // cache friendly, but for now it's a one-time cost on first run, and we would
157 // prefer to remove the need to do this at all eventually.
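// Layout note: TF Lite convolution filters are stored as
// [output_channels, height, width, input_channels]. Flattened to 2D that is
// an [output_channels, height * width * input_channels] matrix, and this
// transpose produces the [height * width * input_channels, output_channels]
// ("HWCN") matrix expected by the multithreaded EigenTensor kernel (see the
// hwcn_weights allocation in Prepare()).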
158 void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) {
159 const int rows = output->dims->data[1];
160 const int cols = output->dims->data[0];
161 const float* input_data = GetTensorData<float>(input);
162 float* output_data = GetTensorData<float>(output);
163 for (int i = 0; i < rows; ++i) {
164 for (int j = 0; j < cols; ++j) {
165 const float in_value = input_data[i * cols + j];
166 output_data[j * rows + i] = in_value;
167 }
168 }
169 }
170
171 // Check whether im2col needs to be allocated, as some versions of the
172 // optimized Conv don't use it. If any change adds im2col support to any of
173 // the Conv versions, it should be updated here as well.
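// For instance, a 1x1 filter with stride 1 and no dilation can be computed as
// a plain matrix multiply over the input, so no im2col buffer is needed in
// that case.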
174 bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params,
175 const TfLiteTensor* filter, OpData* data, bool is_hybrid,
176 KernelType kernel_type) {
177 // If HWCN weights are required, im2col is not required.
178 if (data->need_hwcn_weights) return false;
179
180 // Segregate based on dilated conv & non-dilated conv.
181 const bool need_dilated_im2col =
182 params->dilation_width_factor != 1 || params->dilation_height_factor != 1;
183 const bool need_non_dilated_im2col =
184 params->stride_width != 1 || params->stride_height != 1 ||
185 filter->dims->data[2] != 1 || filter->dims->data[1] != 1;
186
187 const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col;
188
189 // Return early as basic requirement is not met
190 if (!need_im2col) return false;
191
192 // Special case for Hybrid, as it supports only non-dilated im2col currently
193 const bool is_hybrid_non_dilated = is_hybrid && need_non_dilated_im2col;
194 const bool is_quantized = input->type == kTfLiteUInt8 ||
195 input->type == kTfLiteInt8 ||
196 input->type == kTfLiteInt16;
197
198 switch (kernel_type) {
199 case kReference:
200 if (is_hybrid) {
201 return true;
202 } else {
203 return false;
204 }
205 case kGenericOptimized:
206 case kCblasOptimized:
207 if (is_hybrid && !need_non_dilated_im2col) {
208 return false;
209 } else {
210 return true;
211 }
212 case kMultithreadOptimized:
213 if (is_hybrid_non_dilated || is_quantized ||
214 !data->supports_multithreaded_kernel) {
215 return true;
216 } else {
217 return false;
218 }
219 default:
220 return false;
221 }
222 }
223
224 // Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
225 // Note: `context->AddTensors` might invalidate pointers to existing tensors.
226 // Therefore the logic to add tensors is isolated into this function.
227 static TfLiteStatus AllocateTemporaryTensorsIfRequired(
228 TfLiteContext* context, TfLiteNode* node, bool is_hybrid,
229 bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) {
230 auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
231 OpData* data = reinterpret_cast<OpData*>(node->user_data);
232
233 TF_LITE_ENSURE(context, node->inputs->size >= 2);
234 const TfLiteTensor* input;
235 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
236 const TfLiteTensor* filter;
237 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
238
239 // If we're using the optimized multithreaded EigenTensor implementation of
240 // convolution, it expects the filter weights to be transposed compared to
241 // the normal TF Lite buffer format. Typical TF Lite weights are
242 // [filter_count, filter_height, filter_width, input_depth], but for the float
243 // implementation we need them as [filter_height, filter_width, input_depth,
244 // filter_count]. We get to that format by transposing, and create a temporary
245 // buffer to store the results.
246 // This path is only used for float processing, so only create the buffer if
247 // we're running with that data type.
248 data->need_hwcn_weights =
249 input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel;
250
251 // We don't always need to allocate im2col. It is only used in some versions
252 // of the optimized Conv. This test just mimics something that happens inside
253 // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
254 data->need_im2col =
255 IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type);
256
257 // If im2col_oversized is found to be true, we have to fall back to an
258 // execution path (like kReference in float/quantized cases) that doesn't
259 // require the im2col operation. Therefore, we have to skip this check in the
260 // hybrid case (but not the hybrid-per-channel one), where there's no such
261 // fallback execution path.
262 // TODO(b/178743262): Consider making this check conditioned on the available
263 // memory of the system, rather than coupling to the mobile platform check.
264 if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) &&
265 data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) {
266 data->need_im2col = false;
267 data->im2col_oversized = true;
268 }
269 int temporaries_count = 0;
270 if (data->need_im2col) {
271 data->im2col_index = temporaries_count;
272 if (data->im2col_id == kTensorNotAllocated) {
273 context->AddTensors(context, 1, &data->im2col_id);
274 }
275 ++temporaries_count;
276 }
277 if (data->need_hwcn_weights) {
278 data->hwcn_weights_index = temporaries_count;
279 if (data->hwcn_weights_id == kTensorNotAllocated) {
280 context->AddTensors(context, 1, &data->hwcn_weights_id);
281 }
282 ++temporaries_count;
283 }
284
285 if (is_hybrid) {
286 // Allocate tensor to store the on-the-fly quantized inputs.
287 data->input_quantized_index = temporaries_count;
288 if (data->input_quantized_id == kTensorNotAllocated) {
289 TF_LITE_ENSURE_OK(
290 context, context->AddTensors(context, 1, &data->input_quantized_id));
291 }
292 ++temporaries_count;
293
294 // Allocate tensor to store the quantization params computed during
295 // on-the-fly input quantization.
296 data->scaling_factors_index = temporaries_count;
297 if (data->scaling_factors_id == kTensorNotAllocated) {
298 TF_LITE_ENSURE_OK(
299 context, context->AddTensors(context, 1, &data->scaling_factors_id));
300 }
301 ++temporaries_count;
302
303 // Allocate tensor to store the accumulators for the matrix multiply.
304 data->accum_scratch_index = temporaries_count;
305 if (data->accum_scratch_id == kTensorNotAllocated) {
306 TF_LITE_ENSURE_OK(
307 context, context->AddTensors(context, 1, &data->accum_scratch_id));
308 }
309 ++temporaries_count;
310 if (is_per_channel) {
311 data->input_offset_index = temporaries_count;
312 if (data->input_offset_id == kTensorNotAllocated) {
313 TF_LITE_ENSURE_OK(
314 context, context->AddTensors(context, 1, &data->input_offset_id));
315 }
316 ++temporaries_count;
317
318 data->row_sums_index = temporaries_count;
319 if (data->row_sums_id == kTensorNotAllocated) {
320 TF_LITE_ENSURE_OK(context,
321 context->AddTensors(context, 1, &data->row_sums_id));
322 }
323 ++temporaries_count;
324 }
325 }
326
327 TfLiteIntArrayFree(node->temporaries);
328 node->temporaries = TfLiteIntArrayCreate(temporaries_count);
329
330 return kTfLiteOk;
331 }
332
333 TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context,
334 TfLiteNode* node) {
335 auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
336 OpData* data = reinterpret_cast<OpData*>(node->user_data);
337
338 bool has_bias = node->inputs->size == 3;
339 // Check number of inputs/outputs
340 TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
341 TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
342 TfLiteTensor* output;
343 TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
344 const TfLiteTensor* input;
345 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
346 const TfLiteTensor* filter;
347 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
348
349 // Check dimensionality of input, filter
350 TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
351 TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
352 // Check input channels matching filter.
353 // The filter's input channel count can be a factor of the input's channel
354 // count (grouped conv) or equal to it (normal conv).
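// For example, an input with 8 channels and a filter whose last dimension is
// 2 yields groups = 8 / 2 = 4, i.e. each group of filters convolves over 2 of
// the input channels.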
355 auto input_channel = input->dims->data[3];
356 auto filter_input_channel = filter->dims->data[3];
357 TF_LITE_ENSURE_EQ(context, input_channel % filter_input_channel, 0);
358 data->groups = input_channel / filter_input_channel;
359
360 // Check types. (We assume that UINT8 refers to quantized tensors)
361 TfLiteType input_type = input->type;
362 TF_LITE_ENSURE(context,
363 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
364 input_type == kTfLiteInt8 || input_type == kTfLiteInt16);
365 TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type);
366
367 if (input_type == kTfLiteInt16) {
368 TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
369 TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
370 }
371 // The filter must have all-zero zero points for per-channel quantization.
372 if (input_type == kTfLiteInt16 || input_type == kTfLiteInt8) {
373 TF_LITE_ENSURE_EQ(context, filter->quantization.type,
374 kTfLiteAffineQuantization);
375 const auto* affine_quantization =
376 reinterpret_cast<TfLiteAffineQuantization*>(
377 filter->quantization.params);
378 for (int i = 0; i < affine_quantization->zero_point->size; ++i) {
379 TF_LITE_ENSURE_EQ(context, affine_quantization->zero_point->data[i], 0);
380 }
381 }
382
383 const TfLiteTensor* bias = nullptr;
384
385 // TODO(ahentz): At this point the optimized versions require 'bias'. We can
386 // either change that or document that convolution requires it.
387 TF_LITE_ENSURE(context, has_bias);
388
389 if (has_bias) {
390 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias));
391 if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
392 TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32);
393 TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
394 } else if (input_type == kTfLiteInt16) {
395 TF_LITE_ENSURE(context, (bias->type == kTfLiteInt32) ||
396 (bias->type == kTfLiteInt64));
397 TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
398 } else {
399 TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type);
400 }
401 TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
402 }
403
404 const bool is_hybrid =
405 (input->type == kTfLiteFloat32 &&
406 (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));
407
408 if (is_hybrid && filter->type == kTfLiteInt8 &&
409 filter->quantization.type == kTfLiteAffineQuantization &&
410 filter->quantization.params &&
411 reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
412 ->scale &&
413 reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
414 ->scale->size > 1) {
415 const auto* affine_quantization =
416 reinterpret_cast<TfLiteAffineQuantization*>(
417 filter->quantization.params);
418 const float scale = affine_quantization->scale->data[0];
419 for (int i = 1; i < affine_quantization->scale->size; i++) {
420 if (affine_quantization->scale->data[i] != scale) {
421 data->is_hybrid_per_channel = true;
422 break;
423 }
424 }
425 }
426
427 // The multi-threaded kernel supports neither dilation nor hybrid kernels, and
428 // is incompatible with mutable input filters that might change between evals.
429 data->supports_multithreaded_kernel =
430 (kernel_type == kMultithreadOptimized) &&
431 (context->recommended_num_threads != 1) && !is_hybrid &&
432 (params->dilation_width_factor == 1) &&
433 (params->dilation_height_factor == 1) &&
434 (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter);
435
436 int channels_in = filter->dims->data[3];
437 int channels_out = filter->dims->data[0];
438 int width = input->dims->data[2];
439 int height = input->dims->data[1];
440 int filter_width = filter->dims->data[2];
441 int filter_height = filter->dims->data[1];
442 int batches = input->dims->data[0];
443
444 // Matching GetWindowedOutputSize in TensorFlow.
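// Roughly, with effective_filter = (filter - 1) * dilation + 1, this computes
//   SAME:  out = ceil(in / stride)
//   VALID: out = ceil((in - effective_filter + 1) / stride)
// and derives the padding needed to make the windows line up.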
445 auto padding = params->padding;
446 int out_width, out_height;
447 data->padding = ComputePaddingHeightWidth(
448 params->stride_height, params->stride_width,
449 params->dilation_height_factor, params->dilation_width_factor, height,
450 width, filter_height, filter_width, padding, &out_height, &out_width);
451
452 size_t im2col_type_size;
453 TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size));
454 // Note that we intentionally promote the first multiplicand (i.e. 'batches')
455 // to 'size_t' to avoid integer overflow here.
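// As a rough illustrative example: a float32 input of shape [1, 224, 224, 3]
// with a 3x3 filter and SAME/stride-1 output of 224x224 gives
// 1 * 224 * 224 * 3 * 3 * 3 * 4 bytes ~= 5.2 MiB, well under the 1GB mobile
// cap defined near the top of this file.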
456 const size_t im2col_bytes = static_cast<size_t>(batches) * out_height *
457 out_width * channels_in * filter_height *
458 filter_width * im2col_type_size;
459 TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(
460 context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type,
461 im2col_bytes));
462
463 TF_LITE_ENSURE(context, has_bias);
464
465 // Note that full fixed-point inference requires that all tensors have their
466 // parameters set. This is usually done during quantized training or
467 // calibration.
468 if (input_type != kTfLiteFloat32) {
469 TF_LITE_ENSURE_EQ(context, filter->quantization.type,
470 kTfLiteAffineQuantization);
471 const auto* affine_quantization =
472 reinterpret_cast<TfLiteAffineQuantization*>(
473 filter->quantization.params);
474 TF_LITE_ENSURE(context, affine_quantization);
475 TF_LITE_ENSURE(context, affine_quantization->scale);
476 TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 ||
477 affine_quantization->scale->size == channels_out));
478
479 data->per_channel_output_multiplier.resize(channels_out);
480 data->per_channel_output_shift.resize(channels_out);
481 TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
482 context, input, filter, bias, output, params->activation,
483 &data->output_multiplier, &data->output_shift,
484 &data->output_activation_min, &data->output_activation_max,
485 data->per_channel_output_multiplier.data(),
486 data->per_channel_output_shift.data(), channels_out));
487 }
488
489 TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
490 output_size->data[0] = batches;
491 output_size->data[1] = out_height;
492 output_size->data[2] = out_width;
493 output_size->data[3] = channels_out;
494 auto output_status = context->ResizeTensor(context, output, output_size);
495
496 if (output_status != kTfLiteOk) return output_status;
497
498 if (data->need_im2col) {
499 node->temporaries->data[data->im2col_index] = data->im2col_id;
500
501 TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
502
503 auto filter_input_channel = filter->dims->data[3];
504 im2col_size->data[0] = output_size->data[0];
505 im2col_size->data[1] = output_size->data[1];
506 im2col_size->data[2] = output_size->data[2];
507 im2col_size->data[3] = filter_input_channel * filter_height * filter_width;
508
509 TfLiteTensor* im2col =
510 &context->tensors[node->temporaries->data[data->im2col_index]];
511 im2col->type = input->type;
512 if (is_hybrid) {
513 im2col->type = filter->type;
514 }
515 im2col->allocation_type = kTfLiteArenaRw;
516 auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
517 if (im2col_status != kTfLiteOk) return im2col_status;
518 }
519
520 if (data->need_hwcn_weights) {
521 node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
522 TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
523
524 // Because we're treating the filter weights as a matrix when we do the
525 // transpose, we allocate the buffer with a two-dimensional shape, where one
526 // dimension is the number of elements in each filter, and the second is the
527 // total number of filters.
528 auto filter_input_channel = filter->dims->data[3];
529 hwcn_weights_size->data[0] =
530 (filter_height * filter_width * filter_input_channel);
531 hwcn_weights_size->data[1] = channels_out;
532
533 TfLiteTensor* hwcn_weights =
534 &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
535 hwcn_weights->type = input_type;
536 hwcn_weights->name = "Conv_hwcn_weights";
537 hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;
538
539 auto hwcn_weights_status =
540 context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
541 if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
542
543 // TODO(petewarden): If Resize() is called when the size hasn't actually
544 // changed, this will do extra redundant work.
545 data->have_weights_been_transposed = false;
546 }
547
548 if (is_hybrid) {
549 node->temporaries->data[data->input_quantized_index] =
550 data->input_quantized_id;
551 TfLiteTensor* input_quantized;
552 TF_LITE_ENSURE_OK(
553 context, GetTemporarySafe(context, node, data->input_quantized_index,
554 &input_quantized));
555 input_quantized->type = kTfLiteInt8;
556 input_quantized->allocation_type = kTfLiteArenaRw;
557 if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
558 TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
559 TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
560 input_quantized_size));
561 }
562
563 node->temporaries->data[data->scaling_factors_index] =
564 data->scaling_factors_id;
565 TfLiteTensor* scaling_factors;
566 TF_LITE_ENSURE_OK(
567 context, GetTemporarySafe(context, node, data->scaling_factors_index,
568 &scaling_factors));
569 scaling_factors->type = kTfLiteFloat32;
570 scaling_factors->allocation_type = kTfLiteArenaRw;
571 // Only one scale factor per batch is typically necessary. See optimized
572 // implementation for why we need to allocate for the height of the inputs
573 // flattened to 2D.
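// Concretely, that flattened height is NumElements(input) / channels_in,
// i.e. batches * input_height * input_width in the usual non-grouped case, so
// one scale factor slot is allocated per flattened input row even though only
// one per batch is used.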
574 TF_LITE_ENSURE(context, channels_in != 0);
575 const int height = NumElements(input) / channels_in;
576 int scaling_dims[1] = {height};
577 if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
578 TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
579 scaling_factors_size->data[0] = height;
580 TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
581 scaling_factors_size));
582 }
583
584 node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id;
585 TfLiteTensor* accum_scratch;
586 TF_LITE_ENSURE_OK(context,
587 GetTemporarySafe(context, node, data->accum_scratch_index,
588 &accum_scratch));
589 accum_scratch->type = kTfLiteInt32;
590 accum_scratch->allocation_type = kTfLiteArenaRw;
591 const int scratch_width = batches * out_height * out_width;
592 int accum_scratch_dims[2] = {channels_out, scratch_width};
593 if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2,
594 accum_scratch_dims)) {
595 TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2);
596 accum_scratch_size->data[0] = channels_out;
597 accum_scratch_size->data[1] = scratch_width;
598 TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch,
599 accum_scratch_size));
600 }
601
602 if (data->is_hybrid_per_channel) {
603 const auto* affine_quantization =
604 reinterpret_cast<TfLiteAffineQuantization*>(
605 filter->quantization.params);
606 TF_LITE_ENSURE_EQ(
607 context, affine_quantization->scale->size,
608 filter->dims->data[affine_quantization->quantized_dimension]);
609 node->temporaries->data[data->input_offset_index] = data->input_offset_id;
610 TfLiteTensor* input_offsets;
611 TF_LITE_ENSURE_OK(
612 context, GetTemporarySafe(context, node, data->input_offset_index,
613 &input_offsets));
614 input_offsets->type = kTfLiteInt32;
615 input_offsets->allocation_type = kTfLiteArenaRw;
616 // See above comment for the need to allocate for height of inputs.
617 TF_LITE_ENSURE(context, channels_in != 0);
618 const int height = NumElements(input) / channels_in;
619 const int input_offset_dims[1] = {height};
620 if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1,
621 input_offset_dims)) {
622 TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1);
623 input_offsets_size->data[0] = input_offset_dims[0];
624 TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets,
625 input_offsets_size));
626 }
627 node->temporaries->data[data->row_sums_index] = data->row_sums_id;
628 TfLiteTensor* row_sums;
629 TF_LITE_ENSURE_OK(
630 context,
631 GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
632 row_sums->type = kTfLiteInt32;
633 row_sums->name = "Conv_row_sums";
634 row_sums->allocation_type = kTfLiteArenaRwPersistent;
635 // Row sums are cached per output channel, hence 'channels_out' entries.
636 const int row_sums_dims[1] = {channels_out};
637 if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) {
638 TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1);
639 row_sums_size->data[0] = row_sums_dims[0];
640 TF_LITE_ENSURE_OK(
641 context, context->ResizeTensor(context, row_sums, row_sums_size));
642 }
643 }
644 }
645 return kTfLiteOk;
646 }
647
648 template <KernelType kernel_type>
649 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
650 return Prepare(kernel_type, context, node);
651 }
652
653 template <KernelType kernel_type>
654 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
655 TfLiteConvParams* params, OpData* data,
656 const TfLiteTensor* input, const TfLiteTensor* filter,
657 const TfLiteTensor* bias, TfLiteTensor* im2col,
658 TfLiteTensor* output) {
659 auto input_offset = -input->params.zero_point;
660 auto filter_offset = -filter->params.zero_point;
661 auto output_offset = output->params.zero_point;
662
663 KernelType effective_kernel_type;
664 if ((kernel_type == kMultithreadOptimized ||
665 kernel_type == kCblasOptimized) &&
666 (params->dilation_width_factor != 1 ||
667 params->dilation_height_factor != 1)) {
668 // kMultithreadOptimized and kCblasOptimized do not support dilation.
669 // Therefore, fall back to the generic optimized kernel.
670 effective_kernel_type = kGenericOptimized;
671 } else {
672 effective_kernel_type = kernel_type;
673 }
674
675 // We have to fall back to the reference execution path when im2col is needed
676 // but disabled because the to-be-allocated temporary im2col tensor is too
677 // large. See b/178743262 for the detailed motivation.
678 if (data->im2col_oversized) {
679 effective_kernel_type = kReference;
680 }
681
682 // Grouped convolution is currently only supported by the reference kernel.
683 if (data->groups != 1) {
684 effective_kernel_type = kReference;
685 }
686
687 ConvParams op_params;
688 op_params.padding_type = PaddingType::kSame;
689 op_params.padding_values.width = data->padding.width;
690 op_params.padding_values.height = data->padding.height;
691 op_params.dilation_width_factor = params->dilation_width_factor;
692 op_params.dilation_height_factor = params->dilation_height_factor;
693 op_params.stride_width = params->stride_width;
694 op_params.stride_height = params->stride_height;
695 op_params.input_offset = input_offset;
696 op_params.weights_offset = filter_offset;
697 op_params.output_offset = output_offset;
698 op_params.output_multiplier = data->output_multiplier;
699 op_params.output_shift = -data->output_shift;
700 op_params.quantized_activation_min = data->output_activation_min;
701 op_params.quantized_activation_max = data->output_activation_max;
702 switch (effective_kernel_type) {
703 case kReference: {
704 reference_ops::Conv(
705 op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
706 GetTensorShape(filter), GetTensorData<uint8_t>(filter),
707 GetTensorShape(bias), GetTensorData<int32_t>(bias),
708 GetTensorShape(output), GetTensorData<uint8_t>(output),
709 GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
710 /* cpu_backend_context = */ nullptr);
711 break;
712 }
713 case kGenericOptimized:
714 case kMultithreadOptimized:
715 case kCblasOptimized: {
716 // There is only one optimized implementation for Quantized Conv.
717 optimized_ops::Conv(
718 op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
719 GetTensorShape(filter), GetTensorData<uint8_t>(filter),
720 GetTensorShape(bias), GetTensorData<int32_t>(bias),
721 GetTensorShape(output), GetTensorData<uint8_t>(output),
722 GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
723 CpuBackendContext::GetFromContext(context));
724 break;
725 }
726 }
727 }
728
729 template <KernelType kernel_type>
730 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
731 TfLiteConvParams* params, OpData* data,
732 const TfLiteTensor* input,
733 const TfLiteTensor* filter,
734 const TfLiteTensor* bias, TfLiteTensor* output,
735 TfLiteTensor* im2col) {
736 ConvParams op_params;
737 op_params.input_offset = -input->params.zero_point;
738 op_params.output_offset = output->params.zero_point;
739 op_params.stride_height = params->stride_height;
740 op_params.stride_width = params->stride_width;
741 op_params.dilation_height_factor = params->dilation_height_factor;
742 op_params.dilation_width_factor = params->dilation_width_factor;
743 op_params.padding_values.height = data->padding.height;
744 op_params.padding_values.width = data->padding.width;
745 op_params.quantized_activation_min = data->output_activation_min;
746 op_params.quantized_activation_max = data->output_activation_max;
747
748 KernelType effective_kernel_type = kernel_type;
749 // We have to fall back to the reference execution path when im2col is needed
750 // but disabled because the to-be-allocated temporary im2col tensor is too
751 // large. See b/178743262 for the detailed motivation.
752 if (data->im2col_oversized) {
753 effective_kernel_type = kReference;
754 }
755
756 // Grouped convolution is currently only supported by the reference kernel.
757 if (data->groups != 1) {
758 effective_kernel_type = kReference;
759 }
760
761 switch (effective_kernel_type) {
762 case kReference: {
763 reference_integer_ops::ConvPerChannel(
764 op_params, data->per_channel_output_multiplier.data(),
765 data->per_channel_output_shift.data(), GetTensorShape(input),
766 GetTensorData<int8>(input), GetTensorShape(filter),
767 GetTensorData<int8>(filter), GetTensorShape(bias),
768 GetTensorData<int32>(bias), GetTensorShape(output),
769 GetTensorData<int8>(output));
770 break;
771 }
772 case kGenericOptimized:
773 case kMultithreadOptimized:
774 case kCblasOptimized: {
775 optimized_integer_ops::ConvPerChannel(
776 op_params, data->per_channel_output_multiplier.data(),
777 data->per_channel_output_shift.data(), GetTensorShape(input),
778 GetTensorData<int8>(input), GetTensorShape(filter),
779 GetTensorData<int8>(filter), GetTensorShape(bias),
780 GetTensorData<int32>(bias), GetTensorShape(output),
781 GetTensorData<int8>(output), GetTensorShape(im2col),
782 GetTensorData<int8>(im2col),
783 CpuBackendContext::GetFromContext(context));
784 break;
785 }
786 }
787 }
788
789 template <KernelType kernel_type>
790 void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node,
791 TfLiteConvParams* params, OpData* data,
792 const TfLiteTensor* input,
793 const TfLiteTensor* filter,
794 const TfLiteTensor* bias, TfLiteTensor* output,
795 TfLiteTensor* im2col) {
796 ConvParams op_params;
797 op_params.input_offset = -input->params.zero_point;
798 op_params.output_offset = output->params.zero_point;
799 op_params.stride_height = params->stride_height;
800 op_params.stride_width = params->stride_width;
801 op_params.dilation_height_factor = params->dilation_height_factor;
802 op_params.dilation_width_factor = params->dilation_width_factor;
803 op_params.padding_values.height = data->padding.height;
804 op_params.padding_values.width = data->padding.width;
805 op_params.quantized_activation_min = data->output_activation_min;
806 op_params.quantized_activation_max = data->output_activation_max;
807
808 KernelType effective_kernel_type = kernel_type;
809 // We have to fall back to the reference execution path when im2col is needed
810 // but disabled because the to-be-allocated temporary im2col tensor is too
811 // large. See b/178743262 for the detailed motivation.
812 if (data->im2col_oversized) {
813 effective_kernel_type = kReference;
814 }
815
816 // Grouped convolution is currently only supported by the reference kernel.
817 if (data->groups != 1) {
818 effective_kernel_type = kReference;
819 }
820
821 // To prevent 32-bit accumulator overflow in 16x8 quantization, the optimized
822 // path is enabled only when all zero points are 0.
823 bool has_non_zero_point = input->params.zero_point ||
824 filter->params.zero_point ||
825 output->params.zero_point;
826
827 // Fall back to the reference kernel when the bias type is int64, as
828 // there is no optimized kernel for int64 bias yet.
829 if (bias && bias->type == kTfLiteInt64) {
830 reference_integer_ops::ConvPerChannel(
831 op_params, data->per_channel_output_multiplier.data(),
832 data->per_channel_output_shift.data(), GetTensorShape(input),
833 GetTensorData<int16>(input), GetTensorShape(filter),
834 GetTensorData<int8>(filter), GetTensorShape(bias),
835 GetTensorData<std::int64_t>(bias), GetTensorShape(output),
836 GetTensorData<int16>(output));
837 } else if (effective_kernel_type == kReference || has_non_zero_point) {
838 reference_integer_ops::ConvPerChannel(
839 op_params, data->per_channel_output_multiplier.data(),
840 data->per_channel_output_shift.data(), GetTensorShape(input),
841 GetTensorData<int16>(input), GetTensorShape(filter),
842 GetTensorData<int8>(filter), GetTensorShape(bias),
843 GetTensorData<std::int32_t>(bias), GetTensorShape(output),
844 GetTensorData<int16>(output));
845 } else {
846 optimized_integer_ops::ConvPerChannel(
847 op_params, data->per_channel_output_multiplier.data(),
848 data->per_channel_output_shift.data(), GetTensorShape(input),
849 GetTensorData<int16_t>(input), GetTensorShape(filter),
850 GetTensorData<int8_t>(filter), GetTensorShape(bias),
851 GetTensorData<std::int32_t>(bias), GetTensorShape(output),
852 GetTensorData<int16_t>(output), GetTensorShape(im2col),
853 GetTensorData<int16_t>(im2col),
854 CpuBackendContext::GetFromContext(context));
855 }
856 }
857
858 template <KernelType kernel_type>
859 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
860 TfLiteConvParams* params, OpData* data,
861 const TfLiteTensor* input, const TfLiteTensor* filter,
862 const TfLiteTensor* bias, TfLiteTensor* im2col,
863 TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
864 float output_activation_min, output_activation_max;
865 CalculateActivationRange(params->activation, &output_activation_min,
866 &output_activation_max);
867 KernelType effective_kernel_type = kernel_type;
868 // Fall back to the optimized path if multi-threaded conv is unsupported.
869 if ((kernel_type == kMultithreadOptimized) &&
870 !data->supports_multithreaded_kernel) {
871 effective_kernel_type = kGenericOptimized;
872 }
873
874 // When im2col is needed (which is implied when 'im2col_oversized' is true),
875 // the GEMM-based optimized path requires the im2col data to be allocated to
876 // ensure correctness. Therefore, when im2col is disabled because of the
877 // oversized temporary im2col tensor, a fallback to a non-optimized path is
878 // needed.
879 // See b/178743262 for the detailed motivation.
880 if (data->im2col_oversized) {
881 effective_kernel_type = kReference;
882 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
883 // As detailed by tflite::multithreaded_ops::Conv implementation in
884 // multithreaded_conv.h, the Eigen-based execution doesn't need im2col data.
885 // Therefore, we could rely on it as a better-optimized fallback than the
886 // reference one.
887 if (data->supports_multithreaded_kernel) {
888 effective_kernel_type = kMultithreadOptimized;
889 }
890 #endif
891 }
892
893 // Grouped convolution is currently only supported by the reference kernel.
894 if (data->groups != 1) {
895 effective_kernel_type = kReference;
896 }
897
898 ConvParams op_params;
899 op_params.padding_type = RuntimePaddingType(params->padding);
900 op_params.padding_values.width = data->padding.width;
901 op_params.padding_values.height = data->padding.height;
902 op_params.stride_width = params->stride_width;
903 op_params.stride_height = params->stride_height;
904 op_params.dilation_width_factor = params->dilation_width_factor;
905 op_params.dilation_height_factor = params->dilation_height_factor;
906 op_params.float_activation_min = output_activation_min;
907 op_params.float_activation_max = output_activation_max;
908 switch (effective_kernel_type) {
909 case kReference: {
910 reference_ops::Conv(op_params, GetTensorShape(input),
911 GetTensorData<float>(input), GetTensorShape(filter),
912 GetTensorData<float>(filter), GetTensorShape(bias),
913 GetTensorData<float>(bias), GetTensorShape(output),
914 GetTensorData<float>(output), GetTensorShape(im2col),
915 GetTensorData<float>(im2col));
916 break;
917 }
918 case kCblasOptimized:
919 case kGenericOptimized: {
920 optimized_ops::Conv(op_params, GetTensorShape(input),
921 GetTensorData<float>(input), GetTensorShape(filter),
922 GetTensorData<float>(filter), GetTensorShape(bias),
923 GetTensorData<float>(bias), GetTensorShape(output),
924 GetTensorData<float>(output), GetTensorShape(im2col),
925 GetTensorData<float>(im2col),
926 CpuBackendContext::GetFromContext(context));
927 break;
928 }
929 case kMultithreadOptimized: {
930 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
931 const float* filter_data;
932 if (data->need_hwcn_weights) {
933 filter_data = GetTensorData<float>(hwcn_weights);
934 } else {
935 filter_data = GetTensorData<float>(filter);
936 }
937 multithreaded_ops::Conv(
938 *eigen_support::GetThreadPoolDevice(context), op_params,
939 GetTensorShape(input), GetTensorData<float>(input),
940 GetTensorShape(filter), filter_data, GetTensorShape(bias),
941 GetTensorData<float>(bias), GetTensorShape(output),
942 GetTensorData<float>(output), GetTensorShape(im2col),
943 GetTensorData<float>(im2col));
944 break;
945 #else // !defined(TFLITE_WITH_MULTITHREADED_EIGEN)
946 // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
947 // is enabled. We #if out this code in order to get the corresponding
948 // binary size benefits.
949 TFLITE_DCHECK(false);
950 #endif // defined(TFLITE_WITH_MULTITHREADED_EIGEN)
951 }
952 }
953 }
954
955 template <KernelType kernel_type>
956 TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
957 TfLiteConvParams* params, OpData* data,
958 const TfLiteTensor* input,
959 const TfLiteTensor* filter,
960 const TfLiteTensor* bias,
961 TfLiteTensor* im2col, TfLiteTensor* output) {
962 float output_activation_min, output_activation_max;
963 CalculateActivationRange(params->activation, &output_activation_min,
964 &output_activation_max);
965
966 const int batch_size = SizeOfDimension(input, 0);
967 TF_LITE_ENSURE(context, batch_size != 0);
968 const int input_size = NumElements(input) / batch_size;
969 TfLiteTensor* quantized_input_tensor;
970 TF_LITE_ENSURE_OK(context,
971 GetTemporarySafe(context, node, data->input_quantized_index,
972 &quantized_input_tensor));
973 int8_t* quantized_input_ptr_batch =
974 GetTensorData<int8_t>(quantized_input_tensor);
975 TfLiteTensor* scaling_factors_tensor;
976 TF_LITE_ENSURE_OK(context,
977 GetTemporarySafe(context, node, data->scaling_factors_index,
978 &scaling_factors_tensor));
979 float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
980 TfLiteTensor* input_offset_tensor;
981 TF_LITE_ENSURE_OK(context,
982 GetTemporarySafe(context, node, data->input_offset_index,
983 &input_offset_tensor));
984 int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor);
985
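// Each batch row of the float input is asymmetrically quantized to int8 with
// its own scale and zero point; the zero points are kept in input_offset_ptr
// so the kernel can correct the int8 accumulators using the cached filter row
// sums (the 'row_sums' temporary).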
986 for (int b = 0; b < batch_size; ++b) {
987 const int offset = b * input_size;
988 tensor_utils::AsymmetricQuantizeFloats(
989 GetTensorData<float>(input) + offset, input_size,
990 quantized_input_ptr_batch + offset, &scaling_factors_ptr[b],
991 &input_offset_ptr[b]);
992 }
993
994 int8_t* im2col_ptr = nullptr;
995 int8_t* filter_ptr = nullptr;
996 if (im2col != nullptr) {
997 im2col_ptr = im2col->data.int8;
998 }
999 filter_ptr = filter->data.int8;
1000 const auto* affine_quantization =
1001 reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
1002
1003 KernelType effective_kernel_type = kernel_type;
1004 // We have to fall back to the reference execution path when im2col is needed
1005 // but disabled because the to-be-allocated temporary im2col tensor is too
1006 // large. See b/178743262 for the detailed motivation.
1007 if (data->im2col_oversized) {
1008 effective_kernel_type = kReference;
1009 }
1010
1011 // Grouped convolution is currently only supported by the reference kernel.
1012 if (data->groups != 1) {
1013 effective_kernel_type = kReference;
1014 }
1015
1016 ConvParams op_params;
1017 op_params.padding_type = PaddingType::kSame;
1018 op_params.padding_values.width = data->padding.width;
1019 op_params.padding_values.height = data->padding.height;
1020 op_params.dilation_width_factor = params->dilation_width_factor;
1021 op_params.dilation_height_factor = params->dilation_height_factor;
1022 op_params.stride_width = params->stride_width;
1023 op_params.stride_height = params->stride_height;
1024 op_params.float_activation_min = output_activation_min;
1025 op_params.float_activation_max = output_activation_max;
1026 switch (effective_kernel_type) {
1027 case kReference:
1028 reference_ops::HybridConvPerChannel(
1029 op_params, scaling_factors_ptr, GetTensorShape(input),
1030 quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
1031 GetTensorShape(bias), GetTensorData<float>(bias),
1032 GetTensorShape(output), GetTensorData<float>(output),
1033 GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
1034 input_offset_ptr);
1035 break;
1036 case kGenericOptimized:
1037 case kMultithreadOptimized:
1038 case kCblasOptimized: {
1039 TfLiteTensor* row_sums;
1040 TF_LITE_ENSURE_OK(
1041 context,
1042 GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
1043 TfLiteTensor* scratch;
1044 TF_LITE_ENSURE_OK(
1045 context,
1046 GetTemporarySafe(context, node, data->accum_scratch_index, &scratch));
1047 optimized_ops::HybridConvPerChannel(
1048 op_params, scaling_factors_ptr, GetTensorShape(input),
1049 quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
1050 GetTensorShape(bias), GetTensorData<float>(bias),
1051 GetTensorShape(output), GetTensorData<float>(output),
1052 GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
1053 input_offset_ptr, GetTensorShape(scratch),
1054 GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums),
1055 &data->compute_hybrid_row_sums,
1056 CpuBackendContext::GetFromContext(context));
1057 data->compute_hybrid_row_sums = false;
1058 break;
1059 }
1060 }
1061
1062 return kTfLiteOk;
1063 }
1064
1065 template <KernelType kernel_type>
1066 TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
1067 TfLiteConvParams* params, OpData* data,
1068 const TfLiteTensor* input, const TfLiteTensor* filter,
1069 const TfLiteTensor* bias, TfLiteTensor* im2col,
1070 TfLiteTensor* accum_scratch, TfLiteTensor* output) {
1071 float output_activation_min, output_activation_max;
1072 CalculateActivationRange(params->activation, &output_activation_min,
1073 &output_activation_max);
1074
1075 const int batch_size = SizeOfDimension(input, 0);
1076 TF_LITE_ENSURE(context, batch_size != 0);
1077 const int input_size = NumElements(input) / batch_size;
1078
1079 const float* input_ptr = GetTensorData<float>(input);
1080 TfLiteTensor* quantized_input_tensor;
1081 TF_LITE_ENSURE_OK(context,
1082 GetTemporarySafe(context, node, data->input_quantized_index,
1083 &quantized_input_tensor));
1084 int8_t* quantized_input_ptr_batch =
1085 GetTensorData<int8_t>(quantized_input_tensor);
1086 TfLiteTensor* scaling_factors_tensor;
1087 TF_LITE_ENSURE_OK(context,
1088 GetTemporarySafe(context, node, data->scaling_factors_index,
1089 &scaling_factors_tensor));
1090 float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
1091
1092 // Per-batch input quantization for higher accuracy.
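// Each batch row is symmetrically quantized to int8 (its maximum absolute
// value determines the scale), and the per-batch scale is folded together
// with the filter scale below so the int32 accumulators can be rescaled back
// to float.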
1093 {
1094 ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
1095 for (int b = 0; b < batch_size; ++b) {
1096 float unused_min, unused_max;
1097 const int offset = b * input_size;
1098 tensor_utils::SymmetricQuantizeFloats(
1099 input_ptr + offset, input_size, quantized_input_ptr_batch + offset,
1100 &unused_min, &unused_max, &scaling_factors_ptr[b]);
1101 scaling_factors_ptr[b] *= filter->params.scale;
1102 }
1103 }
1104
1105 switch (kernel_type) {
1106 case kReference:
1107 case kGenericOptimized:
1108 case kMultithreadOptimized:
1109 case kCblasOptimized: {
1110 // There is only one implementation for the hybrid kernel.
1111 ConvParams op_params;
1112 op_params.padding_type = PaddingType::kSame;
1113 op_params.padding_values.width = data->padding.width;
1114 op_params.padding_values.height = data->padding.height;
1115 op_params.stride_width = params->stride_width;
1116 op_params.stride_height = params->stride_height;
1117 op_params.dilation_width_factor = params->dilation_width_factor;
1118 op_params.dilation_height_factor = params->dilation_height_factor;
1119 op_params.float_activation_min = output_activation_min;
1120 op_params.float_activation_max = output_activation_max;
1121 if (data->groups == 1) {
1122 optimized_ops::HybridConv(
1123 op_params, scaling_factors_ptr, GetTensorShape(input),
1124 quantized_input_ptr_batch, GetTensorShape(filter),
1125 GetTensorData<int8_t>(filter), GetTensorShape(bias),
1126 GetTensorData<float>(bias), GetTensorShape(accum_scratch),
1127 GetTensorData<int32_t>(accum_scratch), GetTensorShape(output),
1128 GetTensorData<float>(output), GetTensorShape(im2col),
1129 GetTensorData<int8_t>(im2col),
1130 CpuBackendContext::GetFromContext(context));
1131 } else {
1132 // This case is handled by (falls back to) the per-channel hybrid group conv
1133 // and shouldn't hit this branch.
1134 TF_LITE_KERNEL_LOG(
1135 context,
1136 "Group convolution currently not supported for hybrid kernel.");
1137 return kTfLiteError;
1138 }
1139 break;
1140 }
1141 }
1142
1143 return kTfLiteOk;
1144 }
1145
1146 template <KernelType kernel_type, TfLiteType input_type>
1147 TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
1148 auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
1149 OpData* data = reinterpret_cast<OpData*>(node->user_data);
1150
1151 TfLiteTensor* output;
1152 TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
1153 const TfLiteTensor* input;
1154 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1155 const TfLiteTensor* filter;
1156 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
1157 bool has_bias = node->inputs->size == 3;
1158 const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr;
1159 TfLiteTensor* im2col =
1160 data->need_im2col
1161 ? &context->tensors[node->temporaries->data[data->im2col_index]]
1162 : nullptr;
1163 TfLiteTensor* hwcn_weights =
1164 data->need_hwcn_weights
1165 ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
1166 : nullptr;
1167
1168 if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
1169 TransposeFloatTensor(filter, hwcn_weights);
1170 data->have_weights_been_transposed = true;
1171 }
1172
1173 TFLITE_DCHECK_EQ(input_type, input->type);
1174 switch (input_type) { // Already know in/out types are the same.
1175 case kTfLiteFloat32:
1176 if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
1177 if (data->is_hybrid_per_channel ||
1178 // TODO(b/162870360): Fall back to the PerChannel implementation
1179 // until we have grouped hybrid convolution.
1180 data->groups != 1) {
1181 TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>(
1182 context, node, params, data, input,
1183 filter, bias, im2col, output));
1184 } else {
1185 TfLiteTensor* accum_scratch =
1186 &context->tensors[node->temporaries
1187 ->data[data->accum_scratch_index]];
1188 TF_LITE_ENSURE_OK(context,
1189 EvalHybrid<kernel_type>(context, node, params, data,
1190 input, filter, bias, im2col,
1191 accum_scratch, output));
1192 }
1193 } else {
1194 EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
1195 im2col, hwcn_weights, output);
1196 }
1197 break;
1198 case kTfLiteUInt8:
1199 EvalQuantized<kernel_type>(context, node, params, data, input, filter,
1200 bias, im2col, output);
1201 break;
1202 case kTfLiteInt8:
1203 EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
1204 filter, bias, output, im2col);
1205 break;
1206 case kTfLiteInt16:
1207 EvalQuantizedPerChannel16x8<kernel_type>(
1208 context, node, params, data, input, filter, bias, output, im2col);
1209 break;
1210 default:
1211 TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.",
1212 TfLiteTypeGetName(input->type));
1213 return kTfLiteError;
1214 }
1215 return kTfLiteOk;
1216 }
1217
1218 template <KernelType kernel_type>
1219 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
1220 const TfLiteTensor* input;
1221 TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1222
1223 switch (input->type) {
1224 case kTfLiteFloat32:
1225 return EvalImpl<kernel_type, kTfLiteFloat32>(context, node);
1226 case kTfLiteUInt8:
1227 return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
1228 case kTfLiteInt8:
1229 return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
1230 case kTfLiteInt16:
1231 return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
1232 default:
1233 TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
1234 TfLiteTypeGetName(input->type));
1235 return kTfLiteError;
1236 }
1237 }
1238
1239 } // namespace conv
1240
1241 TfLiteRegistration* Register_CONVOLUTION_REF() {
1242 static TfLiteRegistration r = {conv::Init, conv::Free,
1243 conv::Prepare<conv::kReference>,
1244 conv::Eval<conv::kReference>};
1245 return &r;
1246 }
1247
1248 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
1249 static TfLiteRegistration r = {conv::Init, conv::Free,
1250 conv::Prepare<conv::kGenericOptimized>,
1251 conv::Eval<conv::kGenericOptimized>};
1252 return &r;
1253 }
1254
1255 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() {
1256 static TfLiteRegistration r = {
1257 conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>,
1258 conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>};
1259 return &r;
1260 }
1261
1262 TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
1263 static TfLiteRegistration r = {conv::Init, conv::Free,
1264 conv::Prepare<conv::kMultithreadOptimized>,
1265 conv::Eval<conv::kMultithreadOptimized>};
1266 return &r;
1267 }
1268
1269 TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
1270 static TfLiteRegistration r = {conv::Init, conv::Free,
1271 conv::Prepare<conv::kCblasOptimized>,
1272 conv::Eval<conv::kCblasOptimized>};
1273 return &r;
1274 }
1275
1276 TfLiteRegistration* Register_CONV_2D() {
1277 #if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
1278 return Register_CONVOLUTION_CBLAS_OPT();
1279 #elif defined TFLITE_WITH_MULTITHREADED_EIGEN
1280 return Register_CONVOLUTION_MULTITHREADED_OPT();
1281 #else
1282 return Register_CONVOLUTION_GENERIC_OPT();
1283 #endif
1284 }
1285
1286 // Warning: Clients using this variant are responsible for ensuring that their
1287 // models only need the UINT8 type. TFLite's op registration mechanism doesn't
1288 // yet allow for more nuanced registration.
1289 TfLiteRegistration* Register_CONV_2D_UINT8() {
1290 #if defined TFLITE_WITH_RUY
1291 // TFLITE_WITH_RUY optimizes the generic kernel type.
1292 return Register_CONVOLUTION_GENERIC_OPT_UINT8();
1293 #else
1294 return Register_CONV_2D();
1295 #endif
1296 }
1297
1298 } // namespace builtin
1299 } // namespace ops
1300 } // namespace tflite
1301