1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <sys/types.h>
16
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <vector>
25
26 #include <gtest/gtest.h>
27 #include "ruy/context.h" // from @ruy
28 #include "tensorflow/lite/kernels/cpu_backend_context.h"
29 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
30 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
31 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
32 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
33 #include "tensorflow/lite/kernels/internal/test_util.h"
34 #include "tensorflow/lite/kernels/internal/types.h"
35
36 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
37 #include "absl/strings/substitute.h"
38 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h"
39 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
40 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
41 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h"
42 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
43
44 namespace tflite {
45 namespace {
46
47 using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
48 using optimized_ops::depthwise_conv::QuantizationType;
49 using optimized_ops::depthwise_conv::QuantizationTypeImpl;
50 using ::testing::Bool;
51 using ::testing::Values;
52
// Compile-time flag: false when building for AArch64 (__aarch64__ defined),
// true on every other target.
static constexpr bool kLooseIntrinsicsTolerance =
#if defined(__aarch64__)
    false;
#else
    true;
#endif
58
// Optional specialization of the quantization parameters. For the time being
// this plays the role of a plain Boolean "is symmetric?" flag.
enum class ParamsSpecialization {
  kNone = 0,
  // Symmetric quantization: zero is represented by 128.
  kSymmetric = 1,
};
64
// Value that represents zero under symmetric quantization.
static constexpr int kSymmetricZeroPoint{128};
66
// Widens the coverage distribution along one specific aspect. The aspect is
// either chosen explicitly or picked at random, as in a mixture distribution.
enum class CoverageExtension {
  kNone = 0,
  kLargeHeights,  // == 1
  kLargeWidths,   // == 2
  kNumOptions     // == 3, i.e. the number of enumerators listed above.
};
75
76 // The TestParam structure below is the preferred parameterization of tests. A
77 // tuple version is defined in order to support value-parameterized tests.
78 typedef std::tuple<DepthwiseConvImplementation, int, QuantizationType, bool,
79 bool, bool, DepthwiseConvOutputRounding, int, bool>
80 TestParamTuple;
81
82 struct TestParam {
83 TestParam() = default;
84
TestParamtflite::__anon25a17b120111::TestParam85 explicit TestParam(TestParamTuple param_tuple)
86 : forced_invocation(::testing::get<0>(param_tuple)),
87 tests_to_run(::testing::get<1>(param_tuple)),
88 quantization_type(::testing::get<2>(param_tuple)),
89 test_stride(::testing::get<3>(param_tuple)),
90 test_pad(::testing::get<4>(param_tuple)),
91 test_depth_multiplier(::testing::get<5>(param_tuple)),
92 output_rounding(::testing::get<6>(param_tuple)),
93 num_threads(::testing::get<7>(param_tuple)),
94 loose_tolerance(::testing::get<8>(param_tuple)) {}
95
TestNameSuffixtflite::__anon25a17b120111::TestParam96 static std::string TestNameSuffix(
97 const ::testing::TestParamInfo<TestParamTuple>& info) {
98 const TestParam param(info.param);
99 return absl::Substitute(
100 "invocation_$0_quantization_$1_stride_$2_pad_$3_depth_mult_$4",
101 static_cast<int>(param.forced_invocation),
102 static_cast<int>(param.quantization_type), param.test_stride,
103 param.test_pad, param.test_depth_multiplier);
104 }
105
106 DepthwiseConvImplementation forced_invocation =
107 DepthwiseConvImplementation::kNone;
108 int tests_to_run = 0;
109 QuantizationType quantization_type = QuantizationType::kNonPerChannelUint8;
110 bool test_stride = false;
111 bool test_pad = false;
112 bool test_depth_multiplier = false;
113 DepthwiseConvOutputRounding output_rounding =
114 DepthwiseConvOutputRounding::kNone;
115 int num_threads = 1;
116 bool loose_tolerance = false;
117 };
118
119 template <QuantizationType quantization_type>
DispatchDepthwiseConvGeneral(const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,const RuntimeShape & output_shape,typename QuantizationTypeImpl<quantization_type>::ExternalType * output_data,int thread_start,int thread_end,int thread_dim)120 inline void DispatchDepthwiseConvGeneral(
121 const DepthwiseParams& params, const RuntimeShape& input_shape,
122 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
123 input_data,
124 const RuntimeShape& filter_shape,
125 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
126 filter_data,
127 const RuntimeShape& bias_shape, const int32* bias_data,
128 const std::int32_t* output_shift_adjust,
129 const std::int32_t* output_multiplier_adjust,
130 const RuntimeShape& output_shape,
131 typename QuantizationTypeImpl<quantization_type>::ExternalType* output_data,
132 int thread_start, int thread_end, int thread_dim) {
133 optimized_ops::depthwise_conv::DepthwiseConvGeneral(
134 params, input_shape, input_data, filter_shape, filter_data, bias_shape,
135 bias_data, output_shape, output_data, thread_start, thread_end,
136 thread_dim);
137 }
138
139 template <>
DispatchDepthwiseConvGeneral(const DepthwiseParams & params,const RuntimeShape & input_shape,const int8 * input_data,const RuntimeShape & filter_shape,const int8 * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,const RuntimeShape & output_shape,int8 * output_data,int thread_start,int thread_end,int thread_dim)140 inline void DispatchDepthwiseConvGeneral<QuantizationType::kPerChannelInt8>(
141 const DepthwiseParams& params, const RuntimeShape& input_shape,
142 const int8* input_data, const RuntimeShape& filter_shape,
143 const int8* filter_data, const RuntimeShape& bias_shape,
144 const int32* bias_data, const std::int32_t* output_shift_adjust,
145 const std::int32_t* output_multiplier_adjust,
146 const RuntimeShape& output_shape, int8* output_data, int thread_start,
147 int thread_end, int thread_dim) {
148 optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
149 params, output_multiplier_adjust, output_shift_adjust, input_shape,
150 input_data, filter_shape, filter_data, bias_shape, bias_data,
151 output_shape, output_data, thread_start, thread_end, thread_dim);
152 }
153
154 template <QuantizationType quantization_type>
DispatchDepthwiseConvImpl(const TestParam & test_param,const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,typename QuantizationTypeImpl<quantization_type>::ExternalType * output_data)155 inline void DispatchDepthwiseConvImpl(
156 const TestParam& test_param, const DepthwiseParams& params,
157 const RuntimeShape& input_shape,
158 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
159 input_data,
160 const RuntimeShape& filter_shape,
161 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
162 filter_data,
163 const RuntimeShape& bias_shape, const int32* bias_data,
164 const RuntimeShape& output_shape,
165 typename QuantizationTypeImpl<quantization_type>::ExternalType*
166 output_data) {
167 switch (test_param.forced_invocation) {
168 case DepthwiseConvImplementation::kUseNeon3x3: {
169 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
170 // Jetson TX-2. This compiler does not support the offsetof() macro.
171 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
172 const int stride_width = params.stride_width;
173 const int stride_height = params.stride_height;
174 const int pad_width = params.padding_values.width;
175 const int pad_height = params.padding_values.height;
176 const int output_shift = params.output_shift;
177 const int depth_multiplier = params.depth_multiplier;
178 const int dilation_width_factor = params.dilation_width_factor;
179 const int dilation_height_factor = params.dilation_height_factor;
180
181 // Check that parameter combination is supported.
182 const bool basic_3x3_kernel_supported =
183 optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
184 input_shape, filter_shape, stride_width, stride_height,
185 dilation_width_factor, dilation_height_factor, pad_width,
186 pad_height, depth_multiplier, output_shape, output_shift);
187 ASSERT_TRUE(basic_3x3_kernel_supported)
188 << "pad_width = " << params.padding_values.width
189 << " pad_height = " << params.padding_values.height
190 << " input_width = " << input_shape.Dims(2)
191 << " input_height = " << input_shape.Dims(1)
192 << " output_width = " << output_shape.Dims(2)
193 << " output_height = " << output_shape.Dims(1);
194
195 // Call kernel optimized for depthwise convolutions using 3x3 filters.
196 switch (test_param.output_rounding) {
197 case DepthwiseConvOutputRounding::kAwayFromZero:
198 optimized_ops::depthwise_conv::DepthwiseConv3x3Filter<
199 DepthwiseConvOutputRounding::kAwayFromZero>(
200 params, input_shape, input_data, filter_shape, filter_data,
201 bias_shape, bias_data, output_shape, output_data,
202 /*thread_start=*/0,
203 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
204 return;
205 case DepthwiseConvOutputRounding::kUpward:
206 optimized_ops::depthwise_conv::DepthwiseConv3x3Filter<
207 DepthwiseConvOutputRounding::kUpward>(
208 params, input_shape, input_data, filter_shape, filter_data,
209 bias_shape, bias_data, output_shape, output_data,
210 /*thread_start=*/0,
211 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
212 return;
213 default:
214 break;
215 }
216 #endif
217 break;
218 }
219 case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
220 // This is compiled-in even if dot-product instructions are unavailable.
221 // However, tests should skip dot-product testing in that case and not
222 // call this code.
223 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
224 defined(__clang__)
225 DotProduct3x3KernelType kernel_type =
226 optimized_ops::depthwise_conv::CategorizeDotProductKernel(
227 input_shape, filter_shape, output_shape, params);
228
229 ASSERT_NE(kernel_type, DotProduct3x3KernelType::kNone)
230 << "Kernel type = " << static_cast<int>(kernel_type);
231
232 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
233 DepthwiseConvImplementation::kUseNeon3x3DotProduct,
234 quantization_type>(
235 params, input_shape, input_data, filter_shape, filter_data,
236 bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
237 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
238 return;
239 #endif
240 break;
241 }
242 case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
243 DotProduct3x3KernelType kernel_type =
244 optimized_ops::depthwise_conv::CategorizeDotProductKernel(
245 input_shape, filter_shape, output_shape, params);
246
247 ASSERT_TRUE(
248 kernel_type == DotProduct3x3KernelType::kPlain ||
249 kernel_type == DotProduct3x3KernelType::kStride2 ||
250 kernel_type ==
251 DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
252 kernel_type ==
253 DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
254 << "Kernel type = " << static_cast<int>(kernel_type)
255 << " depth_multiplier = " << params.depth_multiplier
256 << " pad_width = " << params.padding_values.width
257 << " pad_height = " << params.padding_values.height
258 << " stride_width = " << params.stride_width
259 << " stride_height = " << params.stride_height
260 << " input_width = " << input_shape.Dims(2)
261 << " input_height = " << input_shape.Dims(1)
262 << " output_width = " << output_shape.Dims(2)
263 << " output_height = " << output_shape.Dims(1)
264 << " depth = " << input_shape.Dims(3)
265 << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
266 << " input_offset = " << params.input_offset;
267
268 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
269 DepthwiseConvImplementation::kUseCModel3x3DotProduct,
270 quantization_type>(
271 params, input_shape, input_data, filter_shape, filter_data,
272 bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
273 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
274 return;
275 }
276 case DepthwiseConvImplementation::kUseUnwound3x3DotProduct: {
277 DotProduct3x3KernelType kernel_type =
278 optimized_ops::depthwise_conv::CategorizeDotProductKernel(
279 input_shape, filter_shape, output_shape, params);
280 ASSERT_TRUE(
281 kernel_type == DotProduct3x3KernelType::kPlain ||
282 kernel_type == DotProduct3x3KernelType::kStride2 ||
283 kernel_type ==
284 DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
285 kernel_type ==
286 DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
287 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
288 DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
289 quantization_type>(
290 params, input_shape, input_data, filter_shape, filter_data,
291 bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
292 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
293 return;
294 }
295 case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
296 #if defined(USE_NEON)
297 DotProduct3x3KernelType kernel_type =
298 optimized_ops::depthwise_conv::CategorizeDotProductKernel(
299 input_shape, filter_shape, output_shape, params);
300
301 ASSERT_TRUE(
302 kernel_type == DotProduct3x3KernelType::kPlain ||
303 kernel_type == DotProduct3x3KernelType::kStride2 ||
304 kernel_type ==
305 DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
306 kernel_type ==
307 DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
308 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
309 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
310 quantization_type>(
311 params, input_shape, input_data, filter_shape, filter_data,
312 bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
313 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
314 return;
315 #else
316 break;
317 #endif
318 }
319 case DepthwiseConvImplementation::kUseGenericKernel: {
320 DispatchDepthwiseConvGeneral<quantization_type>(
321 params, input_shape, input_data, filter_shape, filter_data,
322 bias_shape, bias_data, nullptr, nullptr, output_shape, output_data,
323 /*thread_start=*/0,
324 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
325 return;
326 }
327 case DepthwiseConvImplementation::kNone:
328 default:
329 break;
330 }
331
332 EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
333 << "TODO(b/118426582) requested kernel was not invoked / available yet: "
334 << " forced_invocation = "
335 << static_cast<int>(test_param.forced_invocation)
336 << " depth_multiplier = " << params.depth_multiplier
337 << " pad_width = " << params.padding_values.width
338 << " pad_height = " << params.padding_values.height
339 << " stride_width = " << params.stride_width
340 << " stride_height = " << params.stride_height
341 << " input_width = " << input_shape.Dims(2)
342 << " input_height = " << input_shape.Dims(1)
343 << " output_width = " << output_shape.Dims(2)
344 << " output_height = " << output_shape.Dims(1)
345 << " depth = " << input_shape.Dims(3)
346 << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
347 << " input_offset = " << params.input_offset;
348
349 CpuBackendContext backend_context;
350 backend_context.SetMaxNumThreads(test_param.num_threads);
351 optimized_ops::DepthwiseConv<
352 typename QuantizationTypeImpl<quantization_type>::ExternalType, int32>(
353 params, input_shape, input_data, filter_shape, filter_data, bias_shape,
354 bias_data, output_shape, output_data, &backend_context);
355 }
356
357 template <>
DispatchDepthwiseConvImpl(const TestParam & test_param,const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * output_data)358 inline void DispatchDepthwiseConvImpl<QuantizationType::kPerChannelInt8>(
359 const TestParam& test_param, const DepthwiseParams& params,
360 const RuntimeShape& input_shape,
361 const typename QuantizationTypeImpl<
362 QuantizationType::kPerChannelInt8>::ExternalType* input_data,
363 const RuntimeShape& filter_shape,
364 const typename QuantizationTypeImpl<
365 QuantizationType::kPerChannelInt8>::ExternalType* filter_data,
366 const RuntimeShape& bias_shape, const int32* bias_data,
367 const RuntimeShape& output_shape,
368 typename QuantizationTypeImpl<
369 QuantizationType::kPerChannelInt8>::ExternalType* output_data) {
370 static constexpr QuantizationType quantization_type =
371 QuantizationType::kPerChannelInt8;
372
373 switch (test_param.forced_invocation) {
374 case DepthwiseConvImplementation::kUseNeon3x3: {
375 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
376 // Jetson TX-2. This compiler does not support the offsetof() macro.
377 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
378 const int stride_width = params.stride_width;
379 const int stride_height = params.stride_height;
380 const int pad_width = params.padding_values.width;
381 const int pad_height = params.padding_values.height;
382 const int output_shift = params.output_shift;
383 const int depth_multiplier = params.depth_multiplier;
384 const int dilation_width_factor = params.dilation_width_factor;
385 const int dilation_height_factor = params.dilation_height_factor;
386
387 // Check that parameter combination is supported.
388 const bool basic_3x3_kernel_supported =
389 optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
390 input_shape, filter_shape, stride_width, stride_height,
391 dilation_width_factor, dilation_height_factor, pad_width,
392 pad_height, depth_multiplier, output_shape, output_shift);
393 ASSERT_TRUE(basic_3x3_kernel_supported)
394 << "pad_width = " << params.padding_values.width
395 << " pad_height = " << params.padding_values.height
396 << " input_width = " << input_shape.Dims(2)
397 << " input_height = " << input_shape.Dims(1)
398 << " output_width = " << output_shape.Dims(2)
399 << " output_height = " << output_shape.Dims(1);
400
401 // Call kernel optimized for depthwise convolutions using 3x3 filters.
402 switch (test_param.output_rounding) {
403 case DepthwiseConvOutputRounding::kUpward:
404 optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
405 DepthwiseConvOutputRounding::kUpward>(
406 params, params.output_multiplier_per_channel,
407 params.output_shift_per_channel, input_shape, input_data,
408 filter_shape, filter_data, bias_shape, bias_data, output_shape,
409 output_data,
410 /*thread_start=*/0,
411 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
412 return;
413 case DepthwiseConvOutputRounding::kAwayFromZero:
414 default:
415 // Error case, unsupported. This break sends execution down to
416 // comparison with DepthwiseConvImplementation::kNone later that
417 // reports failure.
418 break;
419 }
420 #endif
421 break;
422 }
423 case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
424 // This is compiled-in even if dot-product instructions are unavailable.
425 // However, tests should skip dot-product testing in that case and not
426 // call this code.
427 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
428 defined(__clang__)
429 DotProduct3x3KernelType kernel_type =
430 optimized_ops::depthwise_conv::CategorizeDotProductKernel<
431 QuantizationType::kPerChannelInt8>(
432 input_shape, filter_shape, output_shape, params,
433 params.output_shift_per_channel);
434
435 ASSERT_NE(kernel_type, DotProduct3x3KernelType::kNone)
436 << "Kernel type = " << static_cast<int>(kernel_type);
437
438 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
439 DepthwiseConvImplementation::kUseNeon3x3DotProduct,
440 quantization_type>(
441 params, input_shape, input_data, filter_shape, filter_data,
442 bias_shape, bias_data, output_shape, output_data,
443 /*thread_start=*/0,
444 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
445 return;
446 #endif
447 break;
448 }
449 case DepthwiseConvImplementation::kUseCModel3x3DotProduct:
450 case DepthwiseConvImplementation::kUseUnwound3x3DotProduct:
451 break;
452 case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
453 #if defined(USE_NEON)
454 DotProduct3x3KernelType kernel_type =
455 optimized_ops::depthwise_conv::CategorizeDotProductKernel<
456 QuantizationType::kPerChannelInt8>(
457 input_shape, filter_shape, output_shape, params,
458 params.output_shift_per_channel);
459
460 ASSERT_TRUE(
461 kernel_type == DotProduct3x3KernelType::kPlain ||
462 kernel_type == DotProduct3x3KernelType::kStride2 ||
463 kernel_type ==
464 DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
465 kernel_type ==
466 DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
467 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
468 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
469 quantization_type>(
470 params, input_shape, input_data, filter_shape, filter_data,
471 bias_shape, bias_data, output_shape, output_data,
472 /*thread_start=*/0,
473 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
474 return;
475 #else
476 break;
477 #endif
478 }
479 case DepthwiseConvImplementation::kUseGenericKernel: {
480 EXPECT_NE(params.output_multiplier_per_channel, nullptr);
481 EXPECT_NE(params.output_shift_per_channel, nullptr);
482 DispatchDepthwiseConvGeneral<quantization_type>(
483 params, input_shape, input_data, filter_shape, filter_data,
484 bias_shape, bias_data, params.output_shift_per_channel,
485 params.output_multiplier_per_channel, output_shape, output_data,
486 /*thread_start=*/0,
487 /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
488 return;
489 }
490 case DepthwiseConvImplementation::kNone:
491 default:
492 break;
493 }
494
495 EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
496 << "Requested kernel was not invoked / available yet: "
497 << " forced_invocation = "
498 << static_cast<int>(test_param.forced_invocation)
499 << " depth_multiplier = " << params.depth_multiplier
500 << " pad_width = " << params.padding_values.width
501 << " pad_height = " << params.padding_values.height
502 << " stride_width = " << params.stride_width
503 << " stride_height = " << params.stride_height
504 << " input_width = " << input_shape.Dims(2)
505 << " input_height = " << input_shape.Dims(1)
506 << " output_width = " << output_shape.Dims(2)
507 << " output_height = " << output_shape.Dims(1)
508 << " depth = " << input_shape.Dims(3)
509 << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
510 << " input_offset = " << params.input_offset;
511
512 EXPECT_NE(params.output_multiplier_per_channel, nullptr);
513 EXPECT_NE(params.output_shift_per_channel, nullptr);
514
515 CpuBackendContext backend_context;
516 backend_context.SetMaxNumThreads(test_param.num_threads);
517 optimized_integer_ops::DepthwiseConvPerChannel(
518 params, params.output_multiplier_per_channel,
519 params.output_shift_per_channel, input_shape, input_data, filter_shape,
520 filter_data, bias_shape, bias_data, output_shape, output_data,
521 &backend_context);
522 }
523
524 template <QuantizationType quantization_type>
DispatchDepthwiseConv(const TestParam & test_param,const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,typename QuantizationTypeImpl<quantization_type>::ExternalType * output_data)525 inline void DispatchDepthwiseConv(
526 const TestParam& test_param, const DepthwiseParams& params,
527 const RuntimeShape& input_shape,
528 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
529 input_data,
530 const RuntimeShape& filter_shape,
531 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
532 filter_data,
533 const RuntimeShape& bias_shape, const int32* bias_data,
534 const RuntimeShape& output_shape,
535 typename QuantizationTypeImpl<quantization_type>::ExternalType*
536 output_data) {
537 DispatchDepthwiseConvImpl<quantization_type>(
538 test_param, params, input_shape, input_data, filter_shape, filter_data,
539 bias_shape, bias_data, output_shape, output_data);
540 }
541
542 template <QuantizationType quantization_type>
543 struct ReferenceRunner {};
544
545 template <>
546 struct ReferenceRunner<QuantizationType::kNonPerChannelUint8> {
Runtflite::__anon25a17b120111::ReferenceRunner547 static inline void Run(
548 const TestParam& test_param, const tflite::DepthwiseParams& op_params,
549 const uint8* input_data, const RuntimeShape& input_shape,
550 const uint8* filter_data, const RuntimeShape& filter_shape,
551 const std::int32_t* bias_data, const RuntimeShape& bias_shape,
552 const RuntimeShape& output_shape, uint8* reference_output_data) {
553 switch (test_param.output_rounding) {
554 case DepthwiseConvOutputRounding::kUpward:
555 reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
556 DepthwiseConvOutputRounding::kUpward>::Run(op_params, input_shape,
557 input_data, filter_shape,
558 filter_data, bias_shape,
559 bias_data, output_shape,
560 reference_output_data);
561 break;
562 case DepthwiseConvOutputRounding::kAwayFromZero:
563 reference_ops::DepthwiseConv(
564 op_params, input_shape, input_data, filter_shape, filter_data,
565 bias_shape, bias_data, output_shape, reference_output_data);
566 break;
567 case DepthwiseConvOutputRounding::kNone:
568 default:
569 EXPECT_NE(test_param.output_rounding,
570 DepthwiseConvOutputRounding::kNone);
571 break;
572 }
573 }
574 };
575
576 template <>
577 struct ReferenceRunner<QuantizationType::kPerChannelInt8> {
Runtflite::__anon25a17b120111::ReferenceRunner578 static inline void Run(
579 const TestParam& test_param, const tflite::DepthwiseParams& op_params,
580 const int8* input_data, const RuntimeShape& input_shape,
581 const int8* filter_data, const RuntimeShape& filter_shape,
582 const std::int32_t* bias_data, const RuntimeShape& bias_shape,
583 const RuntimeShape& output_shape, int8* reference_output_data) {
584 switch (test_param.output_rounding) {
585 case DepthwiseConvOutputRounding::kUpward:
586 reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
587 DepthwiseConvOutputRounding::kUpward>::
588 RunPerChannel(op_params, input_shape, input_data, filter_shape,
589 filter_data, bias_shape, bias_data, output_shape,
590 reference_output_data);
591 break;
592 case DepthwiseConvOutputRounding::kAwayFromZero:
593 reference_integer_ops::DepthwiseConvPerChannel(
594 op_params, op_params.output_multiplier_per_channel,
595 op_params.output_shift_per_channel, input_shape, input_data,
596 filter_shape, filter_data, bias_shape, bias_data, output_shape,
597 reference_output_data);
598 break;
599 case DepthwiseConvOutputRounding::kNone:
600 default:
601 EXPECT_NE(test_param.output_rounding,
602 DepthwiseConvOutputRounding::kNone);
603 break;
604 }
605 }
606 };
607
608 template <QuantizationType quantization_type>
609 // Runs the DepthwiseConv and compares against the reference implementation.
TestOneDepthwiseConvWithGivenOutputShift(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,int output_shift,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)610 int TestOneDepthwiseConvWithGivenOutputShift(
611 const TestParam& test_param,
612 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
613 input_data,
614 const RuntimeShape& input_shape, std::int32_t input_offset,
615 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
616 filter_data,
617 const RuntimeShape& filter_shape, std::int32_t filter_offset,
618 const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
619 PaddingType padding_type, int pad_width, int pad_height,
620 int depth_multiplier, std::int32_t output_offset,
621 std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
622 const std::int32_t* output_multiplier_adjust, int output_shift,
623 std::int32_t output_activation_min, std::int32_t output_activation_max,
624 const RuntimeShape& output_shape) {
625 const int output_buffer_size = output_shape.FlatSize();
626 std::vector<typename QuantizationTypeImpl<quantization_type>::ExternalType>
627 output_data(output_buffer_size, 42);
628 std::vector<typename QuantizationTypeImpl<quantization_type>::ExternalType>
629 reference_output_data(output_buffer_size);
630
631 tflite::DepthwiseParams op_params;
632 op_params.padding_type = padding_type;
633 op_params.padding_values.width = pad_width;
634 op_params.padding_values.height = pad_height;
635 op_params.stride_width = stride;
636 op_params.stride_height = stride;
637 op_params.dilation_width_factor = 1;
638 op_params.dilation_height_factor = 1;
639 op_params.depth_multiplier = depth_multiplier;
640 op_params.quantized_activation_min = output_activation_min;
641 op_params.quantized_activation_max = output_activation_max;
642 op_params.input_offset = input_offset;
643 op_params.weights_offset = filter_offset;
644 op_params.output_offset = output_offset;
645 op_params.output_multiplier = output_multiplier;
646 op_params.output_shift = -output_shift;
647
648 const int depth = output_shape.Dims(3);
649 std::vector<int32> output_multiplier_per_channel(depth, output_multiplier);
650 std::vector<int32> output_shift_per_channel(depth, -output_shift);
651 if (output_multiplier_adjust != nullptr) {
652 for (int i = 0; i < depth; ++i) {
653 output_multiplier_per_channel[i] += output_multiplier_adjust[i];
654 output_shift_per_channel[i] += output_shift_adjust[i];
655 output_shift_per_channel[i] = std::max(-31, output_shift_per_channel[i]);
656 }
657 }
658 op_params.output_multiplier_per_channel =
659 output_multiplier_per_channel.data();
660 op_params.output_shift_per_channel =
661 output_shift_per_channel.data(); // Negated wrt output_shift.
662
663 ReferenceRunner<quantization_type>::Run(
664 test_param, op_params, input_data, input_shape, filter_data, filter_shape,
665 bias_data, bias_shape, output_shape, reference_output_data.data());
666
667 DispatchDepthwiseConv<quantization_type>(
668 test_param, op_params, input_shape, input_data, filter_shape, filter_data,
669 bias_shape, bias_data, output_shape, output_data.data());
670 int saturated_min = 0;
671 int saturated_max = 0;
672 std::vector<int> diff(output_buffer_size);
673 std::int64_t sum_diff = 0;
674 std::int64_t sum_abs_diff = 0;
675 for (int i = 0; i < output_buffer_size; i++) {
676 diff[i] = static_cast<int>(output_data[i]) -
677 static_cast<int>(reference_output_data[i]);
678 sum_diff += diff[i];
679 sum_abs_diff += std::abs(diff[i]);
680 saturated_min += output_data[i] == output_activation_min;
681 saturated_max += output_data[i] == output_activation_max;
682 }
683 // These stats help understand test failures.
684 std::sort(std::begin(diff), std::end(diff));
685 const int min_diff = diff.front();
686 const int max_diff = diff.back();
687 const int median_diff = diff[diff.size() / 2];
688 const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
689 const float mean_abs_diff =
690 static_cast<float>(sum_abs_diff) / output_buffer_size;
691
692 int diff_mean_tolerance = 1;
693 int diff_median_tolerance = 0;
694 // The tolerance that we apply to means is tight, but we allow for a rounding
695 // difference in one pixel, and loosen by another 1% for float comparison.
696 float mean_tolerance = std::max(
697 1e-5f, 1.01f / output_buffer_size * std::sqrt(1.f * depth_multiplier));
698 if (test_param.loose_tolerance) {
699 mean_tolerance = 500.f;
700 diff_mean_tolerance = 256;
701 diff_median_tolerance = 225;
702 }
703
704 // Normally we should require bit-for-bit exact results. Unfortunately a bug
705 // in the Intel arm_neon_sse.h translation header that we use for x86 tests
706 // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
707 // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
708 // few off-by-one errors for now, yet still ensure that no more than a small
709 // minority of values are wrong.
710 EXPECT_LT(std::abs(mean_diff), mean_tolerance);
711 EXPECT_LT(mean_abs_diff, mean_tolerance);
712 EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
713 EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
714 EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
715 EXPECT_TRUE(std::abs(mean_diff) < mean_tolerance &&
716 mean_abs_diff < mean_tolerance &&
717 std::abs(median_diff) <= diff_median_tolerance &&
718 std::abs(min_diff) <= diff_mean_tolerance &&
719 std::abs(max_diff) <= diff_mean_tolerance)
720 << "pad_width = " << op_params.padding_values.width
721 << " pad_height = " << op_params.padding_values.height
722 << " input_width = " << input_shape.Dims(2)
723 << " input_height = " << input_shape.Dims(1)
724 << " output_width = " << output_shape.Dims(2)
725 << " output_height = " << output_shape.Dims(1)
726 << " depth = " << input_shape.Dims(3)
727 << " output_offset = " << op_params.output_offset
728 << " output_multiplier = " << op_params.output_multiplier
729 << " output_shift = " << op_params.output_shift;
730
731 if (saturated_min > 2 * saturated_max) {
732 return -1;
733 }
734 if (saturated_max > 2 * saturated_min) {
735 return 1;
736 }
737 return 0;
738 }
739
740 // The point of this function is that we can't practically know which
741 // output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
742 // could do some statistics for large size, but they would be fragile at smaller
743 // sizes), and guessing wrong would mean that all the values get saturated so
744 // the test becomes vacuous. So we just bisect our way to reasonable
745 // output_shift values.
746 template <QuantizationType quantization_type>
TestOneDepthwiseConvBisectOutputShift(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,int output_activation_bisect_start,int output_activation_bisect_end,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)747 void TestOneDepthwiseConvBisectOutputShift(
748 const TestParam& test_param,
749 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
750 input_data,
751 const RuntimeShape& input_shape, std::int32_t input_offset,
752 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
753 filter_data,
754 const RuntimeShape& filter_shape, std::int32_t filter_offset,
755 const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
756 PaddingType padding_type, int pad_width, int pad_height,
757 int depth_multiplier, std::int32_t output_offset,
758 std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
759 const std::int32_t* output_multiplier_adjust,
760 int output_activation_bisect_start, int output_activation_bisect_end,
761 std::int32_t output_activation_min, std::int32_t output_activation_max,
762 const RuntimeShape& output_shape) {
763 ASSERT_LT(output_activation_bisect_start, output_activation_bisect_end)
764 << "Bisection failed ?!?!";
765 int output_shift_bisect_midpoint =
766 (output_activation_bisect_start + output_activation_bisect_end) / 2;
767 int bisect_result =
768 TestOneDepthwiseConvWithGivenOutputShift<quantization_type>(
769 test_param, input_data, input_shape, input_offset, filter_data,
770 filter_shape, filter_offset, bias_data, bias_shape, stride,
771 padding_type, pad_width, pad_height, depth_multiplier, output_offset,
772 output_multiplier, output_shift_adjust, output_multiplier_adjust,
773 output_shift_bisect_midpoint, output_activation_min,
774 output_activation_max, output_shape);
775 // At this point we know that the test succeeded (otherwise it would have
776 // aborted).
777 if (bisect_result == 0) {
778 // The result isn't particularly saturated on one or the other side.
779 // All good, we're done.
780 return;
781 }
782 if (output_activation_bisect_start == output_activation_bisect_end - 1) {
783 // There is still some saturation on one side, but the bisection is
784 // finished anyways. We're done; nothing more we can do about it. This
785 // happens
786 // in particular when using an activation with a narrow range.
787 return;
788 }
789 // Continue the bisection based on the present result.
790 int new_output_activation_bisect_start = bisect_result == 1
791 ? output_shift_bisect_midpoint
792 : output_activation_bisect_start;
793 int new_output_activation_bisect_end = bisect_result == 1
794 ? output_activation_bisect_end
795 : output_shift_bisect_midpoint;
796 TestOneDepthwiseConvBisectOutputShift<quantization_type>(
797 test_param, input_data, input_shape, input_offset, filter_data,
798 filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
799 pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
800 output_shift_adjust, output_multiplier_adjust,
801 new_output_activation_bisect_start, new_output_activation_bisect_end,
802 output_activation_min, output_activation_max, output_shape);
803 }
804
805 template <QuantizationType quantization_type>
TestOneDepthwiseConv(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)806 void TestOneDepthwiseConv(
807 const TestParam& test_param,
808 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
809 input_data,
810 const RuntimeShape& input_shape, std::int32_t input_offset,
811 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
812 filter_data,
813 const RuntimeShape& filter_shape, std::int32_t filter_offset,
814 const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
815 PaddingType padding_type, int pad_width, int pad_height,
816 int depth_multiplier, std::int32_t output_offset,
817 std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
818 const std::int32_t* output_multiplier_adjust,
819 std::int32_t output_activation_min, std::int32_t output_activation_max,
820 const RuntimeShape& output_shape) {
821 TestOneDepthwiseConvBisectOutputShift<quantization_type>(
822 test_param, input_data, input_shape, input_offset, filter_data,
823 filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
824 pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
825 output_shift_adjust, output_multiplier_adjust, 0, 32,
826 output_activation_min, output_activation_max, output_shape);
827 }
828
TryTestDepthwiseConv(const TestParam & test_param,ParamsSpecialization params_specialization,int batch,int input_depth,int input_width,int input_height,int filter_width,int filter_height,int depth_multiplier,int stride,int dilation_width_factor,int dilation_height_factor,PaddingType padding_type)829 bool TryTestDepthwiseConv(const TestParam& test_param,
830 ParamsSpecialization params_specialization, int batch,
831 int input_depth, int input_width, int input_height,
832 int filter_width, int filter_height,
833 int depth_multiplier, int stride,
834 int dilation_width_factor, int dilation_height_factor,
835 PaddingType padding_type) {
836 const int output_depth = input_depth * depth_multiplier;
837 // The optimized DepthwiseConv implementation currently uses a fixed-size
838 // accumulator buffer on the stack, with that size. This currently means
839 // that it does not support larger output depths. It CHECK's for it,
840 // so it's safe in the sense that if a larger output depth was encountered,
841 // it would explicitly fail. We just need to adjust our testing to that
842 // constraint.
843 const int kMaxSupportedOutputDepth = 1024;
844 if (output_depth > kMaxSupportedOutputDepth) {
845 return false;
846 }
847
848 int output_activation_min;
849 int output_activation_max;
850 std::int32_t output_multiplier;
851 std::int32_t input_offset;
852 std::int32_t output_offset;
853
854 if (test_param.quantization_type == QuantizationType::kNonPerChannelUint8) {
855 output_activation_min = 0;
856 output_activation_max = 255;
857 if (UniformRandomInt(0, 1)) {
858 output_activation_min = UniformRandomInt(0, 50);
859 output_activation_max = UniformRandomInt(200, 255);
860 }
861 output_multiplier =
862 UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
863 input_offset = UniformRandomInt(-255, 0);
864 output_offset = UniformRandomInt(0, 255);
865 } else {
866 output_activation_min = -127;
867 output_activation_max = 127;
868 if (UniformRandomInt(0, 1)) {
869 output_activation_min = UniformRandomInt(-127, -75);
870 output_activation_max = UniformRandomInt(75, 127);
871 }
872 output_multiplier =
873 UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
874 input_offset = UniformRandomInt(-127, 127);
875 output_offset = UniformRandomInt(-127, 127);
876 }
877
878 RuntimeShape input_shape_inference(
879 {batch, input_height, input_width, input_depth});
880 RuntimeShape output_shape_inference;
881 int pad_width, pad_height;
882 if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width,
883 filter_height, stride, dilation_width_factor,
884 dilation_height_factor, padding_type,
885 &output_shape_inference, &pad_width, &pad_height)) {
886 return false;
887 }
888 TFLITE_DCHECK_EQ(output_depth, output_shape_inference.Dims(3));
889
890 RuntimeShape filter_shape_inference(
891 {1, filter_height, filter_width, output_depth});
892 RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
893 const int input_buffer_size = input_shape_inference.FlatSize();
894 const int filter_buffer_size = filter_shape_inference.FlatSize();
895 std::vector<std::int32_t> bias_data(output_depth);
896 FillRandom(&bias_data, -10000, 10000);
897
898 if (test_param.quantization_type == QuantizationType::kPerChannelInt8) {
899 std::vector<std::int8_t> input_data(input_buffer_size);
900 std::vector<std::int8_t> filter_data(filter_buffer_size);
901 FillRandom(&input_data, static_cast<int8>(-127), static_cast<int8>(127));
902 FillRandom(&filter_data, static_cast<int8>(-127), static_cast<int8>(127));
903
904 std::int32_t filter_offset = 0;
905 EXPECT_TRUE(params_specialization == ParamsSpecialization::kSymmetric);
906
907 std::vector<std::int32_t> output_multiplier_adjust(output_depth, 0);
908 std::vector<std::int32_t> output_shift_adjust(output_depth, 0);
909 for (int i = 0; i < output_depth; ++i) {
910 // Thus a good way to randomize multipliers is to subtract from them
911 // a random value smaller than 2^30 but still significant compared to
912 // it.
913 FillRandom(&output_multiplier_adjust, -(1 << 26), 0);
914 FillRandom(&output_shift_adjust, -4, 0);
915 }
916 TestOneDepthwiseConv<QuantizationType::kPerChannelInt8>(
917 test_param, input_data.data(), input_shape_inference, input_offset,
918 filter_data.data(), filter_shape_inference, filter_offset,
919 bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
920 pad_height, depth_multiplier, output_offset, output_multiplier,
921 output_shift_adjust.data(), output_multiplier_adjust.data(),
922 output_activation_min, output_activation_max, output_shape_inference);
923 } else {
924 std::vector<std::uint8_t> input_data(input_buffer_size);
925 std::vector<std::uint8_t> filter_data(filter_buffer_size);
926 FillRandom(&input_data);
927 FillRandom(&filter_data);
928
929 std::int32_t filter_offset = -kSymmetricZeroPoint;
930 if (params_specialization != ParamsSpecialization::kSymmetric) {
931 filter_offset = UniformRandomInt(-255, 0);
932 }
933
934 TestOneDepthwiseConv<QuantizationType::kNonPerChannelUint8>(
935 test_param, input_data.data(), input_shape_inference, input_offset,
936 filter_data.data(), filter_shape_inference, filter_offset,
937 bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
938 pad_height, depth_multiplier, output_offset, output_multiplier,
939 nullptr /*=output_shift_adjust*/, nullptr /*=output_multiplier_adjust*/,
940 output_activation_min, output_activation_max, output_shape_inference);
941 }
942
943 return true;
944 }
945
946 // This function picks some random DepthwiseConv params, which may or may not
947 // be legal. If they're not legal, it returns false. If they're legal,
948 // it runs the DepthwiseConv test and returns true. This allows the caller
949 // to loop until a test has been run.
TryTestOneDepthwiseConv(const TestParam & test_param,ParamsSpecialization params_specialization)950 bool TryTestOneDepthwiseConv(const TestParam& test_param,
951 ParamsSpecialization params_specialization) {
952 // We have to pick a lot of positive values, where we are particularly
953 // interested in small values because they are most likely to be special
954 // cases in optimized implementations, and secondarily because they allow
955 // tests to run fast, which means we can run more tests and get more
956 // coverage.
957 const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
958 const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
959 const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
960 const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
961 const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10);
962 const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10);
963 const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50);
964 const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8);
965 const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4}));
966 const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4}));
967 const auto padding_type =
968 UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
969
970 return TryTestDepthwiseConv(
971 test_param, params_specialization, batch, input_depth, input_width,
972 input_height, filter_width, filter_height, depth_multiplier, stride,
973 dilation_width_factor, dilation_height_factor, padding_type);
974 }
975
976 // Tests parameters for the 3x3 filter kernel.
TryTestOneDepthwiseConv3x3Filter(const TestParam & test_param,ParamsSpecialization params_specialization)977 bool TryTestOneDepthwiseConv3x3Filter(
978 const TestParam& test_param, ParamsSpecialization params_specialization) {
979 const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
980 const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
981 int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
982 int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
983 const int filter_width = 3;
984 const int filter_height = 3;
985 const int depth_multiplier = 1;
986 const int stride = UniformRandomInt(1, 2);
987 // We don't support dilations in the 3x3 filter.
988 const int dilation_width_factor = 1;
989 const int dilation_height_factor = 1;
990 const auto padding_type =
991 UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
992
993 // Adjust for, or reject, special cases.
994 if (test_param.forced_invocation != DepthwiseConvImplementation::kNone) {
995 // With stride == 2 and SAME, padding width and height are the left and top
996 // padding amounts. When there is an even input dimension, padding + 1 is
997 // required on the right / bottom. This is not handled by these kernels, so
998 // we bump the input dimensions.
999 if (padding_type == PaddingType::kSame && stride == 2) {
1000 input_width = 2 * (input_width / 2) + 1;
1001 input_height = 2 * (input_height / 2) + 1;
1002 }
1003
1004 // The padded 3x3 kernel (with kSame) does not support input_width == 1 when
1005 // input_height > 1, and vice versa.
1006 if (padding_type == PaddingType::kSame &&
1007 (input_width > 1) != (input_height > 1)) {
1008 return false;
1009 }
1010 }
1011
1012 return TryTestDepthwiseConv(
1013 test_param, params_specialization, batch, input_depth, input_width,
1014 input_height, filter_width, filter_height, depth_multiplier, stride,
1015 dilation_width_factor, dilation_height_factor, padding_type);
1016 }
1017
1018 // Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
TryTestOneNeonDot3x3(const TestParam & test_param,ParamsSpecialization params_specialization)1019 bool TryTestOneNeonDot3x3(const TestParam& test_param,
1020 ParamsSpecialization params_specialization) {
1021 const CoverageExtension coverage_extension = static_cast<CoverageExtension>(
1022 UniformRandomInt(0, static_cast<int>(CoverageExtension::kNumOptions)));
1023
1024 const int batch = 1;
1025 const int input_depth = test_param.test_depth_multiplier
1026 ? 1
1027 : 8 * ExponentialRandomPositiveInt(0.9f, 3, 50);
1028 const int input_width = coverage_extension == CoverageExtension::kLargeWidths
1029 ? ExponentialRandomPositiveInt(0.9f, 50, 200)
1030 : ExponentialRandomPositiveInt(0.9f, 20, 60);
1031 const int input_height =
1032 coverage_extension == CoverageExtension::kLargeHeights
1033 ? ExponentialRandomPositiveInt(0.9f, 50, 200)
1034 : ExponentialRandomPositiveInt(0.9f, 20, 60);
1035 const int filter_width = 3;
1036 const int filter_height = 3;
1037 const int depth_multiplier =
1038 test_param.test_depth_multiplier
1039 ? 8 * ExponentialRandomPositiveInt(0.2f, 1, 9)
1040 : 1;
1041 const int stride = test_param.test_stride ? 2 : 1;
1042 // We don't support dilations in the 3x3 filter.
1043 const int dilation_width_factor = 1;
1044 const int dilation_height_factor = 1;
1045 const auto padding_type =
1046 test_param.test_pad ? PaddingType::kSame : PaddingType::kValid;
1047
1048 return TryTestDepthwiseConv(
1049 test_param, params_specialization, batch, input_depth, input_width,
1050 input_height, filter_width, filter_height, depth_multiplier, stride,
1051 dilation_width_factor, dilation_height_factor, padding_type);
1052 }
1053
TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,DepthwiseConvOutputRounding output_rounding)1054 void TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,
1055 DepthwiseConvOutputRounding output_rounding) {
1056 TestParam test_param;
1057 test_param.forced_invocation = forced_invocation;
1058 test_param.output_rounding = output_rounding;
1059 while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
1060 }
1061 }
1062
TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation forced_invocation,DepthwiseConvOutputRounding output_rounding)1063 void TestOneDepthwiseConv3x3Filter(
1064 DepthwiseConvImplementation forced_invocation,
1065 DepthwiseConvOutputRounding output_rounding) {
1066 TestParam test_param;
1067 test_param.forced_invocation = forced_invocation;
1068 test_param.output_rounding = output_rounding;
1069 while (!TryTestOneDepthwiseConv3x3Filter(test_param,
1070 ParamsSpecialization::kNone)) {
1071 }
1072 }
1073
TestOneNeonDot3x3(const TestParam & test_param)1074 void TestOneNeonDot3x3(const TestParam& test_param) {
1075 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
1076 defined(__clang__)
1077 CpuFlags cpu_flags;
1078 GetCpuFlags(&cpu_flags);
1079 const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
1080 if (test_param.forced_invocation ==
1081 DepthwiseConvImplementation::kUseNeon3x3DotProduct &&
1082 !has_dot_product_instructions) {
1083 return;
1084 }
1085 #endif
1086
1087 while (!TryTestOneNeonDot3x3(test_param, ParamsSpecialization::kSymmetric)) {
1088 }
1089 }
1090
// Randomized coverage with no forced kernel (DepthwiseConvImplementation::
// kNone) and away-from-zero output rounding.
TEST(TestDepthwiseConv, TestDepthwiseConv) {
  constexpr int kTestsToRun = 1000;
  for (int remaining = kTestsToRun; remaining > 0; --remaining) {
    TestOneDepthwiseConv(DepthwiseConvImplementation::kNone,
                         DepthwiseConvOutputRounding::kAwayFromZero);
  }
}
1098
1099 // Run basic coverage test against the generic kernel.
TEST(TestDepthwiseConv,TestGenericKernel)1100 TEST(TestDepthwiseConv, TestGenericKernel) {
1101 const int kTestsToRun = 1000;
1102 for (int i = 0; i < kTestsToRun; i++) {
1103 TestOneDepthwiseConv(DepthwiseConvImplementation::kUseGenericKernel,
1104 DepthwiseConvOutputRounding::kAwayFromZero);
1105 }
1106 }
1107
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
// 3x3-filter coverage with the NEON 3x3 kernel forced, away-from-zero output
// rounding.
TEST(TestDepthwiseConv, TestNeon3x3FilterAway) {
  constexpr int kTestsToRun = 500;
  for (int remaining = kTestsToRun; remaining > 0; --remaining) {
    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
                                  DepthwiseConvOutputRounding::kAwayFromZero);
  }
}

// 3x3-filter coverage with the NEON 3x3 kernel forced, upward output rounding.
TEST(TestDepthwiseConv, TestNeon3x3FilterUpward) {
  constexpr int kTestsToRun = 500;
  for (int remaining = kTestsToRun; remaining > 0; --remaining) {
    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
                                  DepthwiseConvOutputRounding::kUpward);
  }
}
#endif
1125
1126 // While 3x3 coverage tests are primarily targeted at specialized kernels, we
1127 // also run it against the generic kernel.
TEST(TestDepthwiseConv,TestGenericKernel3x3Filter)1128 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
1129 const int kTestsToRun = 100;
1130 for (int i = 0; i < kTestsToRun; i++) {
1131 TestOneDepthwiseConv3x3Filter(
1132 DepthwiseConvImplementation::kUseGenericKernel,
1133 DepthwiseConvOutputRounding::kAwayFromZero);
1134 }
1135 }
1136
1137 class DepthwiseConvTest : public ::testing::TestWithParam<TestParamTuple> {};
1138
TEST_P(DepthwiseConvTest,NeonDot3x3)1139 TEST_P(DepthwiseConvTest, NeonDot3x3) {
1140 const TestParam param(GetParam());
1141 for (int i = 0; i < param.tests_to_run; i++) {
1142 TestOneNeonDot3x3(param);
1143 }
1144 }
1145
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
// NEON 3x3 kernel, uint8, away-from-zero rounding; stride 1 and 2, no padding,
// no depth multiplication.
INSTANTIATE_TEST_SUITE_P(
    Neon3x3KernelAway, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kUseNeon3x3),
        testing::Values(500),                                   // tests_to_run
        testing::Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
        testing::Bool(),                                        // test_stride
        testing::Values(false),                                 // test_pad
        testing::Values(false),                                 // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kAwayFromZero),
        testing::Values(1),                                     // num_threads
        testing::Values(false)                                  // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// NEON 3x3 kernel, uint8, upward rounding; stride 1 and 2, no padding, no
// depth multiplication.
INSTANTIATE_TEST_SUITE_P(
    Neon3x3KernelUpward, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kUseNeon3x3),
        testing::Values(500),                                   // tests_to_run
        testing::Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
        testing::Bool(),                                        // test_stride
        testing::Values(false),                                 // test_pad
        testing::Values(false),                                 // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                     // num_threads
        testing::Values(false)                                  // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// NEON 3x3 kernel, per-channel int8, upward rounding; stride 1 and 2, no
// padding, no depth multiplication.
INSTANTIATE_TEST_SUITE_P(
    Neon3x3KernelUpwardPerChannel, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kUseNeon3x3),
        testing::Values(500),                                // tests_to_run
        testing::Values(QuantizationType::kPerChannelInt8),  // quantization_type
        testing::Bool(),                                     // test_stride
        testing::Values(false),                              // test_pad
        testing::Values(false),                              // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                  // num_threads
        testing::Values(false)                               // loose_tolerance
        ),
    TestParam::TestNameSuffix);
#endif  // __aarch64__ && !GOOGLE_L4T
1192
1193 // While 3x3 coverage tests are primarily targeted at specialized kernels, we
1194 // also run it against the generic kernel.
1195 INSTANTIATE_TEST_SUITE_P(
1196 GenericKernel, DepthwiseConvTest,
1197 testing::Combine(
1198 Values(DepthwiseConvImplementation::
1199 kUseGenericKernel), // forced_invocation
1200 Values(100), // tests_to_run
1201 Values(QuantizationType::kNonPerChannelUint8), // quantization_type
1202 Bool(), // test_stride
1203 Bool(), // test_pad
1204 Bool(), // test_depth_multiplier
1205 Values(DepthwiseConvOutputRounding::kAwayFromZero), // output_rounding
1206 Values(1), // num_threads
1207 Values(false) // loose_tolerance
1208 ),
1209 TestParam::TestNameSuffix);
1210
1211 INSTANTIATE_TEST_SUITE_P(
1212 GenericKernelPerChannel, DepthwiseConvTest,
1213 testing::Combine(
1214 Values(DepthwiseConvImplementation::
1215 kUseGenericKernel), // forced_invocation
1216 Values(100), // tests_to_run
1217 Values(QuantizationType::kPerChannelInt8), // quantization_type
1218 Bool(), // test_stride
1219 Bool(), // test_pad
1220 Bool(), // test_depth_multiplier
1221 Values(DepthwiseConvOutputRounding::kAwayFromZero), // output_rounding
1222 Values(1), // num_threads
1223 Values(false) // loose_tolerance
1224 ),
1225 TestParam::TestNameSuffix);
1226
1227 INSTANTIATE_TEST_SUITE_P(
1228 CModel, DepthwiseConvTest,
1229 testing::Combine(
1230 Values(DepthwiseConvImplementation::
1231 kUseCModel3x3DotProduct), // forced_invocation
1232 Values(200), // tests_to_run
1233 Values(QuantizationType::kNonPerChannelUint8), // quantization_type
1234 Bool(), // test_stride
1235 Bool(), // test_pad
1236 Bool(), // test_depth_multiplier
1237 Values(DepthwiseConvOutputRounding::kUpward), // output_rounding
1238 Values(1), // num_threads
1239 Values(false) // loose_tolerance
1240 ),
1241 TestParam::TestNameSuffix);
1242
1243 INSTANTIATE_TEST_SUITE_P(
1244 Unwound, DepthwiseConvTest,
1245 testing::Combine(
1246 Values(DepthwiseConvImplementation::
1247 kUseUnwound3x3DotProduct), // forced_invocation
1248 Values(200), // tests_to_run
1249 Values(QuantizationType::kNonPerChannelUint8), // quantization_type
1250 Bool(), // test_stride
1251 Bool(), // test_pad
1252 Bool(), // test_depth_multiplier
1253 Values(DepthwiseConvOutputRounding::kUpward), // output_rounding
1254 Values(1), // num_threads
1255 Values(false) // loose_tolerance
1256 ),
1257 TestParam::TestNameSuffix);
1258
#if defined(USE_NEON)
// Intrinsics tests are run in emulation mode (such as for dot-product
// instructions) unless the tests are built specifically with dot-product
// instructions enabled. The tolerance is governed by
// kLooseIntrinsicsTolerance.
INSTANTIATE_TEST_SUITE_P(
    Intrinsics, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(
            DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct),
        testing::Values(200),                                   // tests_to_run
        testing::Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
        testing::Bool(),                                        // test_stride
        testing::Bool(),                                        // test_pad
        testing::Bool(),                                        // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                     // num_threads
        testing::Values(kLooseIntrinsicsTolerance)              // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// TODO(b/148145875): Remove this extra guard after checking that code runs
// without lax vector conversions.
#if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
    defined(__clang__)
// Per-channel int8 variant of the intrinsics instantiation above.
INSTANTIATE_TEST_SUITE_P(
    IntrinsicsPerChannel, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(
            DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct),
        testing::Values(200),                                // tests_to_run
        testing::Values(QuantizationType::kPerChannelInt8),  // quantization_type
        testing::Bool(),                                     // test_stride
        testing::Bool(),                                     // test_pad
        testing::Bool(),                                     // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                  // num_threads
        testing::Values(kLooseIntrinsicsTolerance)           // loose_tolerance
        ),
    TestParam::TestNameSuffix);
#endif

#endif
1301
#if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
    defined(__clang__)
// NEON 3x3 dot-product kernel forced, uint8, upward rounding, every stride /
// padding / depth-multiplier combination.
INSTANTIATE_TEST_SUITE_P(
    NeonAsm, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kUseNeon3x3DotProduct),
        testing::Values(200),                                   // tests_to_run
        testing::Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
        testing::Bool(),                                        // test_stride
        testing::Bool(),                                        // test_pad
        testing::Bool(),                                        // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                     // num_threads
        testing::Values(false)                                  // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// Per-channel int8 variant of the NeonAsm instantiation.
INSTANTIATE_TEST_SUITE_P(
    NeonAsmPerChannel, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kUseNeon3x3DotProduct),
        testing::Values(200),                                // tests_to_run
        testing::Values(QuantizationType::kPerChannelInt8),  // quantization_type
        testing::Bool(),                                     // test_stride
        testing::Bool(),                                     // test_pad
        testing::Bool(),                                     // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(1),                                  // num_threads
        testing::Values(false)                               // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// Apply the 3x3 tests through the dispatch (no forced kernel).
// Also test multi-threading. This assumes upward rounding.
INSTANTIATE_TEST_SUITE_P(
    Dispatch3x3, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kNone),
        testing::Values(200),                                   // tests_to_run
        testing::Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
        testing::Bool(),                                        // test_stride
        testing::Bool(),                                        // test_pad
        testing::Bool(),                                        // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(4),                                     // num_threads
        testing::Values(false)                                  // loose_tolerance
        ),
    TestParam::TestNameSuffix);

// Per-channel int8 variant of the Dispatch3x3 instantiation.
INSTANTIATE_TEST_SUITE_P(
    Dispatch3x3PerChannel, DepthwiseConvTest,
    testing::Combine(
        // forced_invocation
        testing::Values(DepthwiseConvImplementation::kNone),
        testing::Values(200),                                // tests_to_run
        testing::Values(QuantizationType::kPerChannelInt8),  // quantization_type
        testing::Bool(),                                     // test_stride
        testing::Bool(),                                     // test_pad
        testing::Bool(),                                     // test_depth_multiplier
        // output_rounding
        testing::Values(DepthwiseConvOutputRounding::kUpward),
        testing::Values(4),                                  // num_threads
        testing::Values(false)                               // loose_tolerance
        ),
    TestParam::TestNameSuffix);
#endif
1368
1369 } // namespace
1370 } // namespace tflite
1371