/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_

#include <algorithm>  // std::max / std::min
#include <cstring>    // memset
#include <vector>     // std::vector

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {

// Initializes the accumulator buffer with zeros.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       int32* acc_buffer) {
  memset(acc_buffer, 0,
         sizeof(acc_buffer[0]) * output_depth * num_output_pixels);
}

// General implementation of the hybrid depthwise convolution: int8 inputs and
// filters are accumulated in int32, then rescaled to float outputs.
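// Conceptually, for batch b and output channel c the store stage at the
// bottom of this function computes
//   output = clamp(acc * input_scales[b] * per_channel_scales[c] +
//                      bias_data[c],
//                  float_activation_min, float_activation_max)
// where acc is the int32 sum over the filter taps of
// (input - input_offsets[b]) * filter.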
inline void DepthwiseConvHybridGeneral(
    const DepthwiseParams& params,
    const float* input_scales, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const float* per_channel_scales,
    const int32_t* input_offsets, int thread_start, int thread_end,
    int thread_dim) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_rows = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

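  // The accumulator below is a fixed-size stack buffer. kOutputPixelsInAccBuffer
  // is the number of whole output pixels (output_depth accumulators each) that
  // fit in it, so each output row is processed in tiles of that many pixels.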
  static const int kAccBufferMaxSize = 2048;
  int32 acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConvHybrid op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
                                       FIXED_DEPTH_MULTIPLIER>;           \
  }
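// In the macro above, FIXED_INPUT_DEPTH == 0 means "any input depth", and
// ALLOW_STRIDED == false restricts the specialized kernel to stride_width == 1;
// the first matching specialization wins because of the !row_accum_func check.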

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
#endif  // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_rows;
  int output_ptr_offset = 0;

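  // Threading: thread_dim == 0 partitions the work over batches,
  // thread_dim == 1 over output rows; [thread_start, thread_end) is this
  // thread's slice along that dimension, and output_ptr_offset points the
  // output pointer at the first value this thread writes.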
  switch (thread_dim) {
    case 0:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_rows);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
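  // batch_step skips the (output_rows - (row_end - row_start)) rows of each
  // batch that this thread does not write, so that after finishing a batch
  // output_ptr lands on row_start of the next batch.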
  int batch_step =
      (output_rows + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b) {
    float input_scale = input_scales[b];
    int32_t input_offset = input_offsets[b];
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, acc_buffer);

        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              -input_offset, pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
              out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Now store them as float values.
        ruy::profiler::ScopeLabel label("store");
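        // The NEON path below rescales and stores four channels at a time
        // (multiply by channel scale and input scale, add bias, clamp); the
        // scalar loop after it covers the remaining channels and the non-NEON
        // build.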
        const int num_output_values = output_depth * num_output_pixels;
        int c = 0;
        while (c < output_depth) {
          int target_output_depth = output_depth;

#ifdef USE_NEON
          const float32x4_t output_activation_min_vec =
              vdupq_n_f32(output_activation_min);
          const float32x4_t output_activation_max_vec =
              vdupq_n_f32(output_activation_max);
          const float32x4_t input_scale_32x4 = vdupq_n_f32(input_scale);
          for (; c <= output_depth - 4; c += 4) {
            if ((c + 4) > output_depth) {
              break;
            }
            const float32x4_t channel_scale_32x4 =
                vld1q_f32(per_channel_scales + c);
            const float32x4_t bias_32x4 = vld1q_f32(bias_data + c);
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32x4_t acc = vld1q_s32(acc_buffer + loc);
              float32x4_t float_acc = vcvtq_f32_s32(acc);
              float_acc = vmulq_f32(float_acc, channel_scale_32x4);
              float_acc = vmulq_f32(float_acc, input_scale_32x4);
              float_acc = vaddq_f32(float_acc, bias_32x4);
              float_acc = vmaxq_f32(float_acc, output_activation_min_vec);
              float_acc = vminq_f32(float_acc, output_activation_max_vec);
              vst1q_f32(output_ptr + loc, float_acc);
            }
          }
#endif  // USE_NEON

          for (; c < target_output_depth; c++) {
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32 acc = acc_buffer[loc];
              float float_acc = acc * input_scale * per_channel_scales[c];
              float_acc += bias_data[c];
              float_acc = std::max(float_acc, output_activation_min);
              float_acc = std::min(float_acc, output_activation_max);
              output_ptr[loc] = float_acc;
            }
          }
        }
        output_ptr += num_output_values;
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvHybridWithRounding(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8/8bit");
  const int depth_multiplier = params.depth_multiplier;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_GE(dilation_width_factor, 1);
  TFLITE_DCHECK_GE(dilation_height_factor, 1);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_depth = input_shape.Dims(3);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  // Enable the 3x3 fast path on arm64, except when building with the Nvidia
  // Linux 4 Tegra (L4T) toolchain used on the Jetson TX-2, whose compiler does
  // not support the offsetof() macro.

#if defined(__aarch64__) && !defined(GOOGLE_L4T)
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Call kernel optimized for depthwise convolutions using 3x3 filters if
  // parameters are supported.
  if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
          optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
          input_shape, filter_shape, stride_width, stride_height,
          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
          depth_multiplier, output_shape, 0, nullptr)) {
    ruy::profiler::ScopeLabel specialized_label(
        "DepthwiseConvHybridInt8/8bit/3x3");
    optimized_ops::depthwise_conv::DepthwiseConvHybrid3x3FilterPerChannel<
        DepthwiseConvOutputRounding::kUpward>(
        params, input_scales, input_shape, input_data,
        filter_shape, filter_data, bias_shape, bias_data, output_shape,
        output_data, per_channel_scales, input_offsets,
        thread_start, thread_end, thread_dim);
    return;
  }
#endif

  ruy::profiler::ScopeLabel specialized_label(
      "DepthwiseConvHybridInt8/8bit/General");
  depthwise_conv::DepthwiseConvHybridGeneral(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

inline void DepthwiseConvHybridImpl(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  return DepthwiseConvHybridWithRounding<
      DepthwiseConvOutputRounding::kAwayFromZero>(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

template <typename T, typename TS>
struct DepthwiseConvHybridWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvHybridWorkerTask(const DepthwiseParams& params,
                                const float* input_scales,
                                const RuntimeShape& input_shape,
                                const T* input_data,
                                const RuntimeShape& filter_shape,
                                const T* filter_data,
                                const RuntimeShape& bias_shape,
                                const TS* bias_data,
                                const RuntimeShape& output_shape,
                                float* output_data,
                                const float* per_channel_scales,
                                const int32_t* input_offsets,
                                int thread_start, int thread_end,
                                int thread_dim)
      : params(params),
        input_scales(input_scales),
        input_shape(input_shape),
        input_data(input_data),
        filter_shape(filter_shape),
        filter_data(filter_data),
        bias_shape(bias_shape),
        bias_data(bias_data),
        output_shape(output_shape),
        output_data(output_data),
        per_channel_scales(per_channel_scales),
        input_offsets(input_offsets),
        thread_start(thread_start),
        thread_end(thread_end),
        thread_dim(thread_dim) {}

  void Run() override {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data,
                            bias_shape, bias_data, output_shape,
                            output_data, per_channel_scales, input_offsets,
                            thread_start, thread_end, thread_dim);
  }

 private:
  const DepthwiseParams& params;
  const float* input_scales;
  const RuntimeShape& input_shape;
  const T* input_data;
  const RuntimeShape& filter_shape;
  const T* filter_data;
  const RuntimeShape& bias_shape;
  const TS* bias_data;
  const RuntimeShape& output_shape;
  float* output_data;
  const float* per_channel_scales;
  const int32_t* input_offsets;
  int thread_start;
  int thread_end;
  int thread_dim;
};

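// Illustrative usage sketch for the entry point below (not part of this
// header; the shapes, scale values, and buffer names are hypothetical, and
// the caller is responsible for sizing the buffers to match the shapes):
//
//   DepthwiseParams params;
//   params.stride_width = 1;
//   params.stride_height = 1;
//   params.dilation_width_factor = 1;
//   params.dilation_height_factor = 1;
//   params.padding_values.width = 0;
//   params.padding_values.height = 0;
//   params.depth_multiplier = 1;
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
//   // One input scale/offset per batch, one filter scale per output channel.
//   DepthwiseConvHybridPerChannel(
//       params, input_scales, RuntimeShape({1, 32, 32, 8}), input_data,
//       RuntimeShape({1, 3, 3, 8}), filter_data, RuntimeShape({8}), bias_data,
//       RuntimeShape({1, 30, 30, 8}), output_data, per_channel_scales,
//       input_offsets, &cpu_backend_context);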
inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, int32_t* input_offsets,
    CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int output_batches = output_shape.Dims(0);
  const int output_rows = output_shape.Dims(1);
  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
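  // Split the work along whichever dimension (batches or output rows) admits
  // more parallelism according to HowManyConvThreads.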
  int thread_dim, thread_count, thread_dim_size;
  if (thread_count_batch > thread_count_row) {
    thread_dim = 0;
    thread_dim_size = output_batches;
    thread_count = thread_count_batch;
  } else {
    thread_dim = 1;
    thread_dim_size = output_rows;
    thread_count = thread_count_row;
  }

  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));

  if (thread_count == 1) {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data, bias_shape,
                            bias_data, output_shape, output_data,
                            per_channel_scales, input_offsets,
                            /*thread_start=*/0, /*thread_end=*/output_rows,
                            /*thread_dim=*/1);
  } else {
    std::vector<DepthwiseConvHybridWorkerTask<int8, float>> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // At least we make it a single heap allocation by using reserve().
    tasks.reserve(thread_count);
    int thread_start = 0;
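    // Partition [0, thread_dim_size) into thread_count contiguous, nearly
    // equal ranges; task i processes [thread_start, thread_end) along
    // thread_dim.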
    for (int i = 0; i < thread_count; ++i) {
      int thread_end =
          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
      tasks.emplace_back(params, input_scales, input_shape,
                         input_data, filter_shape, filter_data, bias_shape,
                         bias_data, output_shape, output_data,
                         per_channel_scales, input_offsets, thread_start,
                         thread_end, thread_dim);
      thread_start = thread_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_