1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <sys/types.h>
16 
17 #include <algorithm>
18 #include <cmath>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 #include <vector>
25 
26 #include <gtest/gtest.h>
27 #include "ruy/context.h"  // from @ruy
28 #include "tensorflow/lite/kernels/cpu_backend_context.h"
29 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
30 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
31 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
32 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
33 #include "tensorflow/lite/kernels/internal/test_util.h"
34 #include "tensorflow/lite/kernels/internal/types.h"
35 
36 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
37 #include "absl/strings/substitute.h"
38 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h"
39 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
40 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
41 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h"
42 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
43 
44 namespace tflite {
45 namespace {
46 
47 using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
48 using optimized_ops::depthwise_conv::QuantizationType;
49 using optimized_ops::depthwise_conv::QuantizationTypeImpl;
50 using ::testing::Bool;
51 using ::testing::Values;
52 
// Compile-time tolerance switch: false when building for aarch64, true on
// every other target. NOTE(review): presumably feeds the `loose_tolerance`
// test parameter for intrinsics kernels; the use site is outside this view.
static constexpr bool kLooseIntrinsicsTolerance =
#if defined(__aarch64__)
    false;
#else
    true;
#endif
58 
// Selects a specialization of the quantization parameters. For now this acts
// as a stand-in for a Boolean "is symmetric?" flag.
enum class ParamsSpecialization {
  // No specialization.
  kNone = 0,
  // Symmetric quantization: zero represented by 128.
  kSymmetric = 1,
};
64 
// Value that represents zero under symmetric quantization
// (see ParamsSpecialization::kSymmetric).
static constexpr int kSymmetricZeroPoint{128};
66 
// Widens the coverage distribution along one specific aspect. The aspect is
// either chosen explicitly or picked at random, as in a mixture distribution.
enum class CoverageExtension {
  kNone = 0,      // No extension.
  kLargeHeights,  // == 1
  kLargeWidths,   // == 2
  kNumOptions     // == 3: number of enumerators above; not itself an option.
};
75 
76 // The TestParam structure below is the preferred parameterization of tests. A
77 // tuple version is defined in order to support value-parameterized tests.
78 typedef std::tuple<DepthwiseConvImplementation, int, QuantizationType, bool,
79                    bool, bool, DepthwiseConvOutputRounding, int, bool>
80     TestParamTuple;
81 
82 struct TestParam {
83   TestParam() = default;
84 
TestParamtflite::__anon9426c1610111::TestParam85   explicit TestParam(TestParamTuple param_tuple)
86       : forced_invocation(::testing::get<0>(param_tuple)),
87         tests_to_run(::testing::get<1>(param_tuple)),
88         quantization_type(::testing::get<2>(param_tuple)),
89         test_stride(::testing::get<3>(param_tuple)),
90         test_pad(::testing::get<4>(param_tuple)),
91         test_depth_multiplier(::testing::get<5>(param_tuple)),
92         output_rounding(::testing::get<6>(param_tuple)),
93         num_threads(::testing::get<7>(param_tuple)),
94         loose_tolerance(::testing::get<8>(param_tuple)) {}
95 
TestNameSuffixtflite::__anon9426c1610111::TestParam96   static std::string TestNameSuffix(
97       const ::testing::TestParamInfo<TestParamTuple>& info) {
98     const TestParam param(info.param);
99     return absl::Substitute(
100         "invocation_$0_quantization_$1_stride_$2_pad_$3_depth_mult_$4",
101         static_cast<int>(param.forced_invocation),
102         static_cast<int>(param.quantization_type), param.test_stride,
103         param.test_pad, param.test_depth_multiplier);
104   }
105 
106   DepthwiseConvImplementation forced_invocation =
107       DepthwiseConvImplementation::kNone;
108   int tests_to_run = 0;
109   QuantizationType quantization_type = QuantizationType::kNonPerChannelUint8;
110   bool test_stride = false;
111   bool test_pad = false;
112   bool test_depth_multiplier = false;
113   DepthwiseConvOutputRounding output_rounding =
114       DepthwiseConvOutputRounding::kNone;
115   int num_threads = 1;
116   bool loose_tolerance = false;
117 };
118 
119 template <QuantizationType quantization_type>
DispatchDepthwiseConvGeneral(const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,const RuntimeShape & output_shape,typename QuantizationTypeImpl<quantization_type>::ExternalType * output_data,int thread_start,int thread_end,int thread_dim)120 inline void DispatchDepthwiseConvGeneral(
121     const DepthwiseParams& params, const RuntimeShape& input_shape,
122     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
123         input_data,
124     const RuntimeShape& filter_shape,
125     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
126         filter_data,
127     const RuntimeShape& bias_shape, const int32* bias_data,
128     const std::int32_t* output_shift_adjust,
129     const std::int32_t* output_multiplier_adjust,
130     const RuntimeShape& output_shape,
131     typename QuantizationTypeImpl<quantization_type>::ExternalType* output_data,
132     int thread_start, int thread_end, int thread_dim) {
133   optimized_ops::depthwise_conv::DepthwiseConvGeneral(
134       params, input_shape, input_data, filter_shape, filter_data, bias_shape,
135       bias_data, output_shape, output_data, thread_start, thread_end,
136       thread_dim);
137 }
138 
139 template <>
DispatchDepthwiseConvGeneral(const DepthwiseParams & params,const RuntimeShape & input_shape,const int8 * input_data,const RuntimeShape & filter_shape,const int8 * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,const RuntimeShape & output_shape,int8 * output_data,int thread_start,int thread_end,int thread_dim)140 inline void DispatchDepthwiseConvGeneral<QuantizationType::kPerChannelInt8>(
141     const DepthwiseParams& params, const RuntimeShape& input_shape,
142     const int8* input_data, const RuntimeShape& filter_shape,
143     const int8* filter_data, const RuntimeShape& bias_shape,
144     const int32* bias_data, const std::int32_t* output_shift_adjust,
145     const std::int32_t* output_multiplier_adjust,
146     const RuntimeShape& output_shape, int8* output_data, int thread_start,
147     int thread_end, int thread_dim) {
148   optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
149       params, output_multiplier_adjust, output_shift_adjust, input_shape,
150       input_data, filter_shape, filter_data, bias_shape, bias_data,
151       output_shape, output_data, thread_start, thread_end, thread_dim);
152 }
153 
// Runs the depthwise convolution selected by `test_param.forced_invocation`
// and writes the result to `output_data`.
//
// Each case that is compiled in for the current target first checks (with a
// fatal gtest assertion) that the requested kernel supports this combination
// of shapes and parameters, then calls the kernel with thread_start = 0,
// thread_end = output_shape.Dims(1), thread_dim = 1, and returns.
//
// If no forced kernel ran (kNone was requested, the enumerator is unknown,
// the kernel is compiled out on this target, or the rounding mode has no
// matching variant), execution reaches the bottom of the function. There a
// non-fatal expectation records a failure unless kNone was requested, and
// optimized_ops::DepthwiseConv is run with a CpuBackendContext whose maximum
// thread count is `test_param.num_threads`.
template <QuantizationType quantization_type>
inline void DispatchDepthwiseConvImpl(
    const TestParam& test_param, const DepthwiseParams& params,
    const RuntimeShape& input_shape,
    const typename QuantizationTypeImpl<quantization_type>::ExternalType*
        input_data,
    const RuntimeShape& filter_shape,
    const typename QuantizationTypeImpl<quantization_type>::ExternalType*
        filter_data,
    const RuntimeShape& bias_shape, const int32* bias_data,
    const RuntimeShape& output_shape,
    typename QuantizationTypeImpl<quantization_type>::ExternalType*
        output_data) {
  switch (test_param.forced_invocation) {
    case DepthwiseConvImplementation::kUseNeon3x3: {
// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
// Jetson TX-2. This compiler does not support the offsetof() macro.
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
      const int stride_width = params.stride_width;
      const int stride_height = params.stride_height;
      const int pad_width = params.padding_values.width;
      const int pad_height = params.padding_values.height;
      const int output_shift = params.output_shift;
      const int depth_multiplier = params.depth_multiplier;
      const int dilation_width_factor = params.dilation_width_factor;
      const int dilation_height_factor = params.dilation_height_factor;

      // Check that parameter combination is supported.
      const bool basic_3x3_kernel_supported =
          optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
              input_shape, filter_shape, stride_width, stride_height,
              dilation_width_factor, dilation_height_factor, pad_width,
              pad_height, depth_multiplier, output_shape, output_shift);
      ASSERT_TRUE(basic_3x3_kernel_supported)
          << "pad_width = " << params.padding_values.width
          << " pad_height = " << params.padding_values.height
          << " input_width = " << input_shape.Dims(2)
          << " input_height = " << input_shape.Dims(1)
          << " output_width = " << output_shape.Dims(2)
          << " output_height = " << output_shape.Dims(1);

      // Call kernel optimized for depthwise convolutions using 3x3 filters.
      // The rounding mode is a template argument, so each supported mode
      // needs its own instantiation; any other mode breaks out and is
      // reported below as "not invoked".
      switch (test_param.output_rounding) {
        case DepthwiseConvOutputRounding::kAwayFromZero:
          optimized_ops::depthwise_conv::DepthwiseConv3x3Filter<
              DepthwiseConvOutputRounding::kAwayFromZero>(
              params, input_shape, input_data, filter_shape, filter_data,
              bias_shape, bias_data, output_shape, output_data,
              /*thread_start=*/0,
              /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
          return;
        case DepthwiseConvOutputRounding::kUpward:
          optimized_ops::depthwise_conv::DepthwiseConv3x3Filter<
              DepthwiseConvOutputRounding::kUpward>(
              params, input_shape, input_data, filter_shape, filter_data,
              bias_shape, bias_data, output_shape, output_data,
              /*thread_start=*/0,
              /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
          return;
        default:
          break;
      }
#endif
      break;
    }
    case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
      // This is compiled-in even if dot-product instructions are unavailable.
      // However, tests should skip dot-product testing in that case and not
      // call this code.
      // The call itself is only built for aarch64 + Android + clang (and not
      // L4T); on other targets this case just breaks to the report below.
#if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
    defined(__clang__)
      DotProduct3x3KernelType kernel_type =
          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
              input_shape, filter_shape, output_shape, params);

      ASSERT_NE(kernel_type, DotProduct3x3KernelType::kNone)
          << "Kernel type = " << static_cast<int>(kernel_type);

      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
          DepthwiseConvImplementation::kUseNeon3x3DotProduct,
          quantization_type>(
          params, input_shape, input_data, filter_shape, filter_data,
          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
      return;
#endif
      break;
    }
    case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
      DotProduct3x3KernelType kernel_type =
          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
              input_shape, filter_shape, output_shape, params);

      // Only these four categorized kernel types are accepted here; the
      // failure message dumps the parameters that drove the categorization.
      ASSERT_TRUE(
          kernel_type == DotProduct3x3KernelType::kPlain ||
          kernel_type == DotProduct3x3KernelType::kStride2 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
          << "Kernel type = " << static_cast<int>(kernel_type)
          << " depth_multiplier = " << params.depth_multiplier
          << " pad_width = " << params.padding_values.width
          << " pad_height = " << params.padding_values.height
          << " stride_width = " << params.stride_width
          << " stride_height = " << params.stride_height
          << " input_width = " << input_shape.Dims(2)
          << " input_height = " << input_shape.Dims(1)
          << " output_width = " << output_shape.Dims(2)
          << " output_height = " << output_shape.Dims(1)
          << " depth = " << input_shape.Dims(3)
          << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
          << " input_offset = " << params.input_offset;

      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
          DepthwiseConvImplementation::kUseCModel3x3DotProduct,
          quantization_type>(
          params, input_shape, input_data, filter_shape, filter_data,
          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
      return;
    }
    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct: {
      DotProduct3x3KernelType kernel_type =
          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
              input_shape, filter_shape, output_shape, params);
      // Same four accepted kernel types as the C-model case above, without
      // the diagnostic message.
      ASSERT_TRUE(
          kernel_type == DotProduct3x3KernelType::kPlain ||
          kernel_type == DotProduct3x3KernelType::kStride2 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
          DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
          quantization_type>(
          params, input_shape, input_data, filter_shape, filter_data,
          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
      return;
    }
    case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
      // Built only when USE_NEON is defined; otherwise this case breaks to
      // the "not invoked" report below.
#if defined(USE_NEON)
      DotProduct3x3KernelType kernel_type =
          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
              input_shape, filter_shape, output_shape, params);

      ASSERT_TRUE(
          kernel_type == DotProduct3x3KernelType::kPlain ||
          kernel_type == DotProduct3x3KernelType::kStride2 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
          kernel_type ==
              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
          DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
          quantization_type>(
          params, input_shape, input_data, filter_shape, filter_data,
          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
      return;
#else
      break;
#endif
    }
    case DepthwiseConvImplementation::kUseGenericKernel: {
      // The generic (non-specialized) dispatcher does not read the two
      // adjustment arrays, so nullptr is passed for both.
      DispatchDepthwiseConvGeneral<quantization_type>(
          params, input_shape, input_data, filter_shape, filter_data,
          bias_shape, bias_data, nullptr, nullptr, output_shape, output_data,
          /*thread_start=*/0,
          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
      return;
    }
    case DepthwiseConvImplementation::kNone:
    default:
      break;
  }

  // Reaching this point means no forced kernel was run. That is only the
  // expected outcome when kNone was requested; the expectation is non-fatal,
  // so the default kernel below still runs after a failure is recorded.
  EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
      << "TODO(b/118426582) requested kernel was not invoked / available yet: "
      << " forced_invocation = "
      << static_cast<int>(test_param.forced_invocation)
      << " depth_multiplier = " << params.depth_multiplier
      << " pad_width = " << params.padding_values.width
      << " pad_height = " << params.padding_values.height
      << " stride_width = " << params.stride_width
      << " stride_height = " << params.stride_height
      << " input_width = " << input_shape.Dims(2)
      << " input_height = " << input_shape.Dims(1)
      << " output_width = " << output_shape.Dims(2)
      << " output_height = " << output_shape.Dims(1)
      << " depth = " << input_shape.Dims(3)
      << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
      << " input_offset = " << params.input_offset;

  // Default path: the general optimized entry point, with the thread limit
  // taken from the test parameters.
  CpuBackendContext backend_context;
  backend_context.SetMaxNumThreads(test_param.num_threads);
  optimized_ops::DepthwiseConv<
      typename QuantizationTypeImpl<quantization_type>::ExternalType, int32>(
      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
      bias_data, output_shape, output_data, &backend_context);
}
356 
357 template <>
DispatchDepthwiseConvImpl(const TestParam & test_param,const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,typename QuantizationTypeImpl<QuantizationType::kPerChannelInt8>::ExternalType * output_data)358 inline void DispatchDepthwiseConvImpl<QuantizationType::kPerChannelInt8>(
359     const TestParam& test_param, const DepthwiseParams& params,
360     const RuntimeShape& input_shape,
361     const typename QuantizationTypeImpl<
362         QuantizationType::kPerChannelInt8>::ExternalType* input_data,
363     const RuntimeShape& filter_shape,
364     const typename QuantizationTypeImpl<
365         QuantizationType::kPerChannelInt8>::ExternalType* filter_data,
366     const RuntimeShape& bias_shape, const int32* bias_data,
367     const RuntimeShape& output_shape,
368     typename QuantizationTypeImpl<
369         QuantizationType::kPerChannelInt8>::ExternalType* output_data) {
370   static constexpr QuantizationType quantization_type =
371       QuantizationType::kPerChannelInt8;
372 
373   switch (test_param.forced_invocation) {
374     case DepthwiseConvImplementation::kUseNeon3x3: {
375 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
376 // Jetson TX-2. This compiler does not support the offsetof() macro.
377 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
378       const int stride_width = params.stride_width;
379       const int stride_height = params.stride_height;
380       const int pad_width = params.padding_values.width;
381       const int pad_height = params.padding_values.height;
382       const int output_shift = params.output_shift;
383       const int depth_multiplier = params.depth_multiplier;
384       const int dilation_width_factor = params.dilation_width_factor;
385       const int dilation_height_factor = params.dilation_height_factor;
386 
387       // Check that parameter combination is supported.
388       const bool basic_3x3_kernel_supported =
389           optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
390               input_shape, filter_shape, stride_width, stride_height,
391               dilation_width_factor, dilation_height_factor, pad_width,
392               pad_height, depth_multiplier, output_shape, output_shift);
393       ASSERT_TRUE(basic_3x3_kernel_supported)
394           << "pad_width = " << params.padding_values.width
395           << " pad_height = " << params.padding_values.height
396           << " input_width = " << input_shape.Dims(2)
397           << " input_height = " << input_shape.Dims(1)
398           << " output_width = " << output_shape.Dims(2)
399           << " output_height = " << output_shape.Dims(1);
400 
401       // Call kernel optimized for depthwise convolutions using 3x3 filters.
402       switch (test_param.output_rounding) {
403         case DepthwiseConvOutputRounding::kUpward:
404           optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
405               DepthwiseConvOutputRounding::kUpward>(
406               params, params.output_multiplier_per_channel,
407               params.output_shift_per_channel, input_shape, input_data,
408               filter_shape, filter_data, bias_shape, bias_data, output_shape,
409               output_data,
410               /*thread_start=*/0,
411               /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
412           return;
413         case DepthwiseConvOutputRounding::kAwayFromZero:
414         default:
415           // Error case, unsupported. This break sends execution down to
416           // comparison with DepthwiseConvImplementation::kNone later that
417           // reports failure.
418           break;
419       }
420 #endif
421       break;
422     }
423     case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
424       // This is compiled-in even if dot-product instructions are unavailable.
425       // However, tests should skip dot-product testing in that case and not
426       // call this code.
427 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
428     defined(__clang__)
429       DotProduct3x3KernelType kernel_type =
430           optimized_ops::depthwise_conv::CategorizeDotProductKernel<
431               QuantizationType::kPerChannelInt8>(
432               input_shape, filter_shape, output_shape, params,
433               params.output_shift_per_channel);
434 
435       ASSERT_NE(kernel_type, DotProduct3x3KernelType::kNone)
436           << "Kernel type = " << static_cast<int>(kernel_type);
437 
438       optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
439           DepthwiseConvImplementation::kUseNeon3x3DotProduct,
440           quantization_type>(
441           params, input_shape, input_data, filter_shape, filter_data,
442           bias_shape, bias_data, output_shape, output_data,
443           /*thread_start=*/0,
444           /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
445       return;
446 #endif
447       break;
448     }
449     case DepthwiseConvImplementation::kUseCModel3x3DotProduct:
450     case DepthwiseConvImplementation::kUseUnwound3x3DotProduct:
451       break;
452     case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
453 #if defined(USE_NEON)
454       DotProduct3x3KernelType kernel_type =
455           optimized_ops::depthwise_conv::CategorizeDotProductKernel<
456               QuantizationType::kPerChannelInt8>(
457               input_shape, filter_shape, output_shape, params,
458               params.output_shift_per_channel);
459 
460       ASSERT_TRUE(
461           kernel_type == DotProduct3x3KernelType::kPlain ||
462           kernel_type == DotProduct3x3KernelType::kStride2 ||
463           kernel_type ==
464               DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
465           kernel_type ==
466               DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
467       optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl<
468           DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
469           quantization_type>(
470           params, input_shape, input_data, filter_shape, filter_data,
471           bias_shape, bias_data, output_shape, output_data,
472           /*thread_start=*/0,
473           /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
474       return;
475 #else
476       break;
477 #endif
478     }
479     case DepthwiseConvImplementation::kUseGenericKernel: {
480       EXPECT_NE(params.output_multiplier_per_channel, nullptr);
481       EXPECT_NE(params.output_shift_per_channel, nullptr);
482       DispatchDepthwiseConvGeneral<quantization_type>(
483           params, input_shape, input_data, filter_shape, filter_data,
484           bias_shape, bias_data, params.output_shift_per_channel,
485           params.output_multiplier_per_channel, output_shape, output_data,
486           /*thread_start=*/0,
487           /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
488       return;
489     }
490     case DepthwiseConvImplementation::kNone:
491     default:
492       break;
493   }
494 
495   EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
496       << "Requested kernel was not invoked / available yet: "
497       << " forced_invocation = "
498       << static_cast<int>(test_param.forced_invocation)
499       << " depth_multiplier = " << params.depth_multiplier
500       << " pad_width = " << params.padding_values.width
501       << " pad_height = " << params.padding_values.height
502       << " stride_width = " << params.stride_width
503       << " stride_height = " << params.stride_height
504       << " input_width = " << input_shape.Dims(2)
505       << " input_height = " << input_shape.Dims(1)
506       << " output_width = " << output_shape.Dims(2)
507       << " output_height = " << output_shape.Dims(1)
508       << " depth = " << input_shape.Dims(3)
509       << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
510       << " input_offset = " << params.input_offset;
511 
512   EXPECT_NE(params.output_multiplier_per_channel, nullptr);
513   EXPECT_NE(params.output_shift_per_channel, nullptr);
514 
515   CpuBackendContext backend_context;
516   backend_context.SetMaxNumThreads(test_param.num_threads);
517   optimized_integer_ops::DepthwiseConvPerChannel(
518       params, params.output_multiplier_per_channel,
519       params.output_shift_per_channel, input_shape, input_data, filter_shape,
520       filter_data, bias_shape, bias_data, output_shape, output_data,
521       &backend_context);
522 }
523 
524 template <QuantizationType quantization_type>
DispatchDepthwiseConv(const TestParam & test_param,const DepthwiseParams & params,const RuntimeShape & input_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & filter_shape,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,typename QuantizationTypeImpl<quantization_type>::ExternalType * output_data)525 inline void DispatchDepthwiseConv(
526     const TestParam& test_param, const DepthwiseParams& params,
527     const RuntimeShape& input_shape,
528     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
529         input_data,
530     const RuntimeShape& filter_shape,
531     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
532         filter_data,
533     const RuntimeShape& bias_shape, const int32* bias_data,
534     const RuntimeShape& output_shape,
535     typename QuantizationTypeImpl<quantization_type>::ExternalType*
536         output_data) {
537   DispatchDepthwiseConvImpl<quantization_type>(
538       test_param, params, input_shape, input_data, filter_shape, filter_data,
539       bias_shape, bias_data, output_shape, output_data);
540 }
541 
542 template <QuantizationType quantization_type>
543 struct ReferenceRunner {};
544 
545 template <>
546 struct ReferenceRunner<QuantizationType::kNonPerChannelUint8> {
Runtflite::__anon9426c1610111::ReferenceRunner547   static inline void Run(
548       const TestParam& test_param, const tflite::DepthwiseParams& op_params,
549       const uint8* input_data, const RuntimeShape& input_shape,
550       const uint8* filter_data, const RuntimeShape& filter_shape,
551       const std::int32_t* bias_data, const RuntimeShape& bias_shape,
552       const RuntimeShape& output_shape, uint8* reference_output_data) {
553     switch (test_param.output_rounding) {
554       case DepthwiseConvOutputRounding::kUpward:
555         reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
556             DepthwiseConvOutputRounding::kUpward>::Run(op_params, input_shape,
557                                                        input_data, filter_shape,
558                                                        filter_data, bias_shape,
559                                                        bias_data, output_shape,
560                                                        reference_output_data);
561         break;
562       case DepthwiseConvOutputRounding::kAwayFromZero:
563         reference_ops::DepthwiseConv(
564             op_params, input_shape, input_data, filter_shape, filter_data,
565             bias_shape, bias_data, output_shape, reference_output_data);
566         break;
567       case DepthwiseConvOutputRounding::kNone:
568       default:
569         EXPECT_NE(test_param.output_rounding,
570                   DepthwiseConvOutputRounding::kNone);
571         break;
572     }
573   }
574 };
575 
576 template <>
577 struct ReferenceRunner<QuantizationType::kPerChannelInt8> {
Runtflite::__anon9426c1610111::ReferenceRunner578   static inline void Run(
579       const TestParam& test_param, const tflite::DepthwiseParams& op_params,
580       const int8* input_data, const RuntimeShape& input_shape,
581       const int8* filter_data, const RuntimeShape& filter_shape,
582       const std::int32_t* bias_data, const RuntimeShape& bias_shape,
583       const RuntimeShape& output_shape, int8* reference_output_data) {
584     switch (test_param.output_rounding) {
585       case DepthwiseConvOutputRounding::kUpward:
586         reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
587             DepthwiseConvOutputRounding::kUpward>::
588             RunPerChannel(op_params, input_shape, input_data, filter_shape,
589                           filter_data, bias_shape, bias_data, output_shape,
590                           reference_output_data);
591         break;
592       case DepthwiseConvOutputRounding::kAwayFromZero:
593         reference_integer_ops::DepthwiseConvPerChannel(
594             op_params, op_params.output_multiplier_per_channel,
595             op_params.output_shift_per_channel, input_shape, input_data,
596             filter_shape, filter_data, bias_shape, bias_data, output_shape,
597             reference_output_data);
598         break;
599       case DepthwiseConvOutputRounding::kNone:
600       default:
601         EXPECT_NE(test_param.output_rounding,
602                   DepthwiseConvOutputRounding::kNone);
603         break;
604     }
605   }
606 };
607 
608 template <QuantizationType quantization_type>
609 // Runs the DepthwiseConv and compares against the reference implementation.
TestOneDepthwiseConvWithGivenOutputShift(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,int output_shift,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)610 int TestOneDepthwiseConvWithGivenOutputShift(
611     const TestParam& test_param,
612     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
613         input_data,
614     const RuntimeShape& input_shape, std::int32_t input_offset,
615     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
616         filter_data,
617     const RuntimeShape& filter_shape, std::int32_t filter_offset,
618     const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
619     PaddingType padding_type, int pad_width, int pad_height,
620     int depth_multiplier, std::int32_t output_offset,
621     std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
622     const std::int32_t* output_multiplier_adjust, int output_shift,
623     std::int32_t output_activation_min, std::int32_t output_activation_max,
624     const RuntimeShape& output_shape) {
625   const int output_buffer_size = output_shape.FlatSize();
626   std::vector<typename QuantizationTypeImpl<quantization_type>::ExternalType>
627       output_data(output_buffer_size, 42);
628   std::vector<typename QuantizationTypeImpl<quantization_type>::ExternalType>
629       reference_output_data(output_buffer_size);
630 
631   tflite::DepthwiseParams op_params;
632   op_params.padding_type = padding_type;
633   op_params.padding_values.width = pad_width;
634   op_params.padding_values.height = pad_height;
635   op_params.stride_width = stride;
636   op_params.stride_height = stride;
637   op_params.dilation_width_factor = 1;
638   op_params.dilation_height_factor = 1;
639   op_params.depth_multiplier = depth_multiplier;
640   op_params.quantized_activation_min = output_activation_min;
641   op_params.quantized_activation_max = output_activation_max;
642   op_params.input_offset = input_offset;
643   op_params.weights_offset = filter_offset;
644   op_params.output_offset = output_offset;
645   op_params.output_multiplier = output_multiplier;
646   op_params.output_shift = -output_shift;
647 
648   const int depth = output_shape.Dims(3);
649   std::vector<int32> output_multiplier_per_channel(depth, output_multiplier);
650   std::vector<int32> output_shift_per_channel(depth, -output_shift);
651   if (output_multiplier_adjust != nullptr) {
652     for (int i = 0; i < depth; ++i) {
653       output_multiplier_per_channel[i] += output_multiplier_adjust[i];
654       output_shift_per_channel[i] += output_shift_adjust[i];
655       output_shift_per_channel[i] = std::max(-31, output_shift_per_channel[i]);
656     }
657   }
658   op_params.output_multiplier_per_channel =
659       output_multiplier_per_channel.data();
660   op_params.output_shift_per_channel =
661       output_shift_per_channel.data();  // Negated wrt output_shift.
662 
663   ReferenceRunner<quantization_type>::Run(
664       test_param, op_params, input_data, input_shape, filter_data, filter_shape,
665       bias_data, bias_shape, output_shape, reference_output_data.data());
666 
667   DispatchDepthwiseConv<quantization_type>(
668       test_param, op_params, input_shape, input_data, filter_shape, filter_data,
669       bias_shape, bias_data, output_shape, output_data.data());
670   int saturated_min = 0;
671   int saturated_max = 0;
672   std::vector<int> diff(output_buffer_size);
673   std::int64_t sum_diff = 0;
674   std::int64_t sum_abs_diff = 0;
675   for (int i = 0; i < output_buffer_size; i++) {
676     diff[i] = static_cast<int>(output_data[i]) -
677               static_cast<int>(reference_output_data[i]);
678     sum_diff += diff[i];
679     sum_abs_diff += std::abs(diff[i]);
680     saturated_min += output_data[i] == output_activation_min;
681     saturated_max += output_data[i] == output_activation_max;
682   }
683   // These stats help understand test failures.
684   std::sort(std::begin(diff), std::end(diff));
685   const int min_diff = diff.front();
686   const int max_diff = diff.back();
687   const int median_diff = diff[diff.size() / 2];
688   const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
689   const float mean_abs_diff =
690       static_cast<float>(sum_abs_diff) / output_buffer_size;
691 
692   int diff_mean_tolerance = 1;
693   int diff_median_tolerance = 0;
694   // The tolerance that we apply to means is tight, but we allow for a rounding
695   // difference in one pixel, and loosen by another 1% for float comparison.
696   float mean_tolerance = std::max(
697       1e-5f, 1.01f / output_buffer_size * std::sqrt(1.f * depth_multiplier));
698   if (test_param.loose_tolerance) {
699     mean_tolerance = 500.f;
700     diff_mean_tolerance = 256;
701     diff_median_tolerance = 225;
702   }
703 
704   // Normally we should require bit-for-bit exact results. Unfortunately a bug
705   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
706   // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
707   // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
708   // few off-by-one errors for now, yet still ensure that no more than a small
709   // minority of values are wrong.
710   EXPECT_LT(std::abs(mean_diff), mean_tolerance);
711   EXPECT_LT(mean_abs_diff, mean_tolerance);
712   EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
713   EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
714   EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
715   EXPECT_TRUE(std::abs(mean_diff) < mean_tolerance &&
716               mean_abs_diff < mean_tolerance &&
717               std::abs(median_diff) <= diff_median_tolerance &&
718               std::abs(min_diff) <= diff_mean_tolerance &&
719               std::abs(max_diff) <= diff_mean_tolerance)
720       << "pad_width = " << op_params.padding_values.width
721       << " pad_height = " << op_params.padding_values.height
722       << " input_width = " << input_shape.Dims(2)
723       << " input_height = " << input_shape.Dims(1)
724       << " output_width = " << output_shape.Dims(2)
725       << " output_height = " << output_shape.Dims(1)
726       << " depth = " << input_shape.Dims(3)
727       << " output_offset = " << op_params.output_offset
728       << " output_multiplier = " << op_params.output_multiplier
729       << " output_shift = " << op_params.output_shift;
730 
731   if (saturated_min > 2 * saturated_max) {
732     return -1;
733   }
734   if (saturated_max > 2 * saturated_min) {
735     return 1;
736   }
737   return 0;
738 }
739 
740 // The point of this function is that we can't practically know which
741 // output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
742 // could do some statistics for large size, but they would be fragile at smaller
743 // sizes), and guessing wrong would mean that all the values get saturated so
744 // the test becomes vacuous. So we just bisect our way to reasonable
745 // output_shift values.
746 template <QuantizationType quantization_type>
TestOneDepthwiseConvBisectOutputShift(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,int output_activation_bisect_start,int output_activation_bisect_end,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)747 void TestOneDepthwiseConvBisectOutputShift(
748     const TestParam& test_param,
749     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
750         input_data,
751     const RuntimeShape& input_shape, std::int32_t input_offset,
752     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
753         filter_data,
754     const RuntimeShape& filter_shape, std::int32_t filter_offset,
755     const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
756     PaddingType padding_type, int pad_width, int pad_height,
757     int depth_multiplier, std::int32_t output_offset,
758     std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
759     const std::int32_t* output_multiplier_adjust,
760     int output_activation_bisect_start, int output_activation_bisect_end,
761     std::int32_t output_activation_min, std::int32_t output_activation_max,
762     const RuntimeShape& output_shape) {
763   ASSERT_LT(output_activation_bisect_start, output_activation_bisect_end)
764       << "Bisection failed ?!?!";
765   int output_shift_bisect_midpoint =
766       (output_activation_bisect_start + output_activation_bisect_end) / 2;
767   int bisect_result =
768       TestOneDepthwiseConvWithGivenOutputShift<quantization_type>(
769           test_param, input_data, input_shape, input_offset, filter_data,
770           filter_shape, filter_offset, bias_data, bias_shape, stride,
771           padding_type, pad_width, pad_height, depth_multiplier, output_offset,
772           output_multiplier, output_shift_adjust, output_multiplier_adjust,
773           output_shift_bisect_midpoint, output_activation_min,
774           output_activation_max, output_shape);
775   // At this point we know that the test succeeded (otherwise it would have
776   // aborted).
777   if (bisect_result == 0) {
778     // The result isn't particularly saturated on one or the other side.
779     // All good, we're done.
780     return;
781   }
782   if (output_activation_bisect_start == output_activation_bisect_end - 1) {
783     // There is still some saturation on one side, but the bisection is
784     // finished anyways. We're done; nothing more we can do about it. This
785     // happens
786     // in particular when using an activation with a narrow range.
787     return;
788   }
789   // Continue the bisection based on the present result.
790   int new_output_activation_bisect_start = bisect_result == 1
791                                                ? output_shift_bisect_midpoint
792                                                : output_activation_bisect_start;
793   int new_output_activation_bisect_end = bisect_result == 1
794                                              ? output_activation_bisect_end
795                                              : output_shift_bisect_midpoint;
796   TestOneDepthwiseConvBisectOutputShift<quantization_type>(
797       test_param, input_data, input_shape, input_offset, filter_data,
798       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
799       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
800       output_shift_adjust, output_multiplier_adjust,
801       new_output_activation_bisect_start, new_output_activation_bisect_end,
802       output_activation_min, output_activation_max, output_shape);
803 }
804 
805 template <QuantizationType quantization_type>
TestOneDepthwiseConv(const TestParam & test_param,const typename QuantizationTypeImpl<quantization_type>::ExternalType * input_data,const RuntimeShape & input_shape,std::int32_t input_offset,const typename QuantizationTypeImpl<quantization_type>::ExternalType * filter_data,const RuntimeShape & filter_shape,std::int32_t filter_offset,const std::int32_t * bias_data,const RuntimeShape & bias_shape,int stride,PaddingType padding_type,int pad_width,int pad_height,int depth_multiplier,std::int32_t output_offset,std::int32_t output_multiplier,const std::int32_t * output_shift_adjust,const std::int32_t * output_multiplier_adjust,std::int32_t output_activation_min,std::int32_t output_activation_max,const RuntimeShape & output_shape)806 void TestOneDepthwiseConv(
807     const TestParam& test_param,
808     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
809         input_data,
810     const RuntimeShape& input_shape, std::int32_t input_offset,
811     const typename QuantizationTypeImpl<quantization_type>::ExternalType*
812         filter_data,
813     const RuntimeShape& filter_shape, std::int32_t filter_offset,
814     const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
815     PaddingType padding_type, int pad_width, int pad_height,
816     int depth_multiplier, std::int32_t output_offset,
817     std::int32_t output_multiplier, const std::int32_t* output_shift_adjust,
818     const std::int32_t* output_multiplier_adjust,
819     std::int32_t output_activation_min, std::int32_t output_activation_max,
820     const RuntimeShape& output_shape) {
821   TestOneDepthwiseConvBisectOutputShift<quantization_type>(
822       test_param, input_data, input_shape, input_offset, filter_data,
823       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
824       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
825       output_shift_adjust, output_multiplier_adjust, 0, 32,
826       output_activation_min, output_activation_max, output_shape);
827 }
828 
TryTestDepthwiseConv(const TestParam & test_param,ParamsSpecialization params_specialization,int batch,int input_depth,int input_width,int input_height,int filter_width,int filter_height,int depth_multiplier,int stride,int dilation_width_factor,int dilation_height_factor,PaddingType padding_type)829 bool TryTestDepthwiseConv(const TestParam& test_param,
830                           ParamsSpecialization params_specialization, int batch,
831                           int input_depth, int input_width, int input_height,
832                           int filter_width, int filter_height,
833                           int depth_multiplier, int stride,
834                           int dilation_width_factor, int dilation_height_factor,
835                           PaddingType padding_type) {
836   const int output_depth = input_depth * depth_multiplier;
837   // The optimized DepthwiseConv implementation currently uses a fixed-size
838   // accumulator buffer on the stack, with that size. This currently means
839   // that it does not support larger output depths. It CHECK's for it,
840   // so it's safe in the sense that if a larger output depth was encountered,
841   // it would explicitly fail. We just need to adjust our testing to that
842   // constraint.
843   const int kMaxSupportedOutputDepth = 1024;
844   if (output_depth > kMaxSupportedOutputDepth) {
845     return false;
846   }
847 
848   int output_activation_min;
849   int output_activation_max;
850   std::int32_t output_multiplier;
851   std::int32_t input_offset;
852   std::int32_t output_offset;
853 
854   if (test_param.quantization_type == QuantizationType::kNonPerChannelUint8) {
855     output_activation_min = 0;
856     output_activation_max = 255;
857     if (UniformRandomInt(0, 1)) {
858       output_activation_min = UniformRandomInt(0, 50);
859       output_activation_max = UniformRandomInt(200, 255);
860     }
861     output_multiplier =
862         UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
863     input_offset = UniformRandomInt(-255, 0);
864     output_offset = UniformRandomInt(0, 255);
865   } else {
866     output_activation_min = -127;
867     output_activation_max = 127;
868     if (UniformRandomInt(0, 1)) {
869       output_activation_min = UniformRandomInt(-127, -75);
870       output_activation_max = UniformRandomInt(75, 127);
871     }
872     output_multiplier =
873         UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
874     input_offset = UniformRandomInt(-127, 127);
875     output_offset = UniformRandomInt(-127, 127);
876   }
877 
878   RuntimeShape input_shape_inference(
879       {batch, input_height, input_width, input_depth});
880   RuntimeShape output_shape_inference;
881   int pad_width, pad_height;
882   if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width,
883                         filter_height, stride, dilation_width_factor,
884                         dilation_height_factor, padding_type,
885                         &output_shape_inference, &pad_width, &pad_height)) {
886     return false;
887   }
888   TFLITE_DCHECK_EQ(output_depth, output_shape_inference.Dims(3));
889 
890   RuntimeShape filter_shape_inference(
891       {1, filter_height, filter_width, output_depth});
892   RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
893   const int input_buffer_size = input_shape_inference.FlatSize();
894   const int filter_buffer_size = filter_shape_inference.FlatSize();
895   std::vector<std::int32_t> bias_data(output_depth);
896   FillRandom(&bias_data, -10000, 10000);
897 
898   if (test_param.quantization_type == QuantizationType::kPerChannelInt8) {
899     std::vector<std::int8_t> input_data(input_buffer_size);
900     std::vector<std::int8_t> filter_data(filter_buffer_size);
901     FillRandom(&input_data, static_cast<int8>(-127), static_cast<int8>(127));
902     FillRandom(&filter_data, static_cast<int8>(-127), static_cast<int8>(127));
903 
904     std::int32_t filter_offset = 0;
905     EXPECT_TRUE(params_specialization == ParamsSpecialization::kSymmetric);
906 
907     std::vector<std::int32_t> output_multiplier_adjust(output_depth, 0);
908     std::vector<std::int32_t> output_shift_adjust(output_depth, 0);
909     for (int i = 0; i < output_depth; ++i) {
910       // Thus a good way to randomize multipliers is to subtract from them
911       // a random value smaller than 2^30 but still significant compared to
912       // it.
913       FillRandom(&output_multiplier_adjust, -(1 << 26), 0);
914       FillRandom(&output_shift_adjust, -4, 0);
915     }
916     TestOneDepthwiseConv<QuantizationType::kPerChannelInt8>(
917         test_param, input_data.data(), input_shape_inference, input_offset,
918         filter_data.data(), filter_shape_inference, filter_offset,
919         bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
920         pad_height, depth_multiplier, output_offset, output_multiplier,
921         output_shift_adjust.data(), output_multiplier_adjust.data(),
922         output_activation_min, output_activation_max, output_shape_inference);
923   } else {
924     std::vector<std::uint8_t> input_data(input_buffer_size);
925     std::vector<std::uint8_t> filter_data(filter_buffer_size);
926     FillRandom(&input_data);
927     FillRandom(&filter_data);
928 
929     std::int32_t filter_offset = -kSymmetricZeroPoint;
930     if (params_specialization != ParamsSpecialization::kSymmetric) {
931       filter_offset = UniformRandomInt(-255, 0);
932     }
933 
934     TestOneDepthwiseConv<QuantizationType::kNonPerChannelUint8>(
935         test_param, input_data.data(), input_shape_inference, input_offset,
936         filter_data.data(), filter_shape_inference, filter_offset,
937         bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
938         pad_height, depth_multiplier, output_offset, output_multiplier,
939         nullptr /*=output_shift_adjust*/, nullptr /*=output_multiplier_adjust*/,
940         output_activation_min, output_activation_max, output_shape_inference);
941   }
942 
943   return true;
944 }
945 
946 // This function picks some random DepthwiseConv params, which may or may not
947 // be legal. If they're not legal, it returns false. If they're legal,
948 // it runs the DepthwiseConv test and returns true. This allows the caller
949 // to loop until a test has been run.
TryTestOneDepthwiseConv(const TestParam & test_param,ParamsSpecialization params_specialization)950 bool TryTestOneDepthwiseConv(const TestParam& test_param,
951                              ParamsSpecialization params_specialization) {
952   // We have to pick a lot of positive values, where we are particularly
953   // interested in small values because they are most likely to be special
954   // cases in optimized implementations, and secondarily because they allow
955   // tests to run fast, which means we can run more tests and get more
956   // coverage.
957   const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
958   const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
959   const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
960   const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
961   const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10);
962   const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10);
963   const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50);
964   const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8);
965   const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4}));
966   const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4}));
967   const auto padding_type =
968       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
969 
970   return TryTestDepthwiseConv(
971       test_param, params_specialization, batch, input_depth, input_width,
972       input_height, filter_width, filter_height, depth_multiplier, stride,
973       dilation_width_factor, dilation_height_factor, padding_type);
974 }
975 
976 // Tests parameters for the 3x3 filter kernel.
TryTestOneDepthwiseConv3x3Filter(const TestParam & test_param,ParamsSpecialization params_specialization)977 bool TryTestOneDepthwiseConv3x3Filter(
978     const TestParam& test_param, ParamsSpecialization params_specialization) {
979   const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
980   const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
981   int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
982   int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
983   const int filter_width = 3;
984   const int filter_height = 3;
985   const int depth_multiplier = 1;
986   const int stride = UniformRandomInt(1, 2);
987   // We don't support dilations in the 3x3 filter.
988   const int dilation_width_factor = 1;
989   const int dilation_height_factor = 1;
990   const auto padding_type =
991       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
992 
993   // Adjust for, or reject, special cases.
994   if (test_param.forced_invocation != DepthwiseConvImplementation::kNone) {
995     // With stride == 2 and SAME, padding width and height are the left and top
996     // padding amounts. When there is an even input dimension, padding + 1 is
997     // required on the right / bottom. This is not handled by these kernels, so
998     // we bump the input dimensions.
999     if (padding_type == PaddingType::kSame && stride == 2) {
1000       input_width = 2 * (input_width / 2) + 1;
1001       input_height = 2 * (input_height / 2) + 1;
1002     }
1003 
1004     // The padded 3x3 kernel (with kSame) does not support input_width == 1 when
1005     // input_height > 1, and vice versa.
1006     if (padding_type == PaddingType::kSame &&
1007         (input_width > 1) != (input_height > 1)) {
1008       return false;
1009     }
1010   }
1011 
1012   return TryTestDepthwiseConv(
1013       test_param, params_specialization, batch, input_depth, input_width,
1014       input_height, filter_width, filter_height, depth_multiplier, stride,
1015       dilation_width_factor, dilation_height_factor, padding_type);
1016 }
1017 
1018 // Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
TryTestOneNeonDot3x3(const TestParam & test_param,ParamsSpecialization params_specialization)1019 bool TryTestOneNeonDot3x3(const TestParam& test_param,
1020                           ParamsSpecialization params_specialization) {
1021   const CoverageExtension coverage_extension = static_cast<CoverageExtension>(
1022       UniformRandomInt(0, static_cast<int>(CoverageExtension::kNumOptions)));
1023 
1024   const int batch = 1;
1025   const int input_depth = test_param.test_depth_multiplier
1026                               ? 1
1027                               : 8 * ExponentialRandomPositiveInt(0.9f, 3, 50);
1028   const int input_width = coverage_extension == CoverageExtension::kLargeWidths
1029                               ? ExponentialRandomPositiveInt(0.9f, 50, 200)
1030                               : ExponentialRandomPositiveInt(0.9f, 20, 60);
1031   const int input_height =
1032       coverage_extension == CoverageExtension::kLargeHeights
1033           ? ExponentialRandomPositiveInt(0.9f, 50, 200)
1034           : ExponentialRandomPositiveInt(0.9f, 20, 60);
1035   const int filter_width = 3;
1036   const int filter_height = 3;
1037   const int depth_multiplier =
1038       test_param.test_depth_multiplier
1039           ? 8 * ExponentialRandomPositiveInt(0.2f, 1, 9)
1040           : 1;
1041   const int stride = test_param.test_stride ? 2 : 1;
1042   // We don't support dilations in the 3x3 filter.
1043   const int dilation_width_factor = 1;
1044   const int dilation_height_factor = 1;
1045   const auto padding_type =
1046       test_param.test_pad ? PaddingType::kSame : PaddingType::kValid;
1047 
1048   return TryTestDepthwiseConv(
1049       test_param, params_specialization, batch, input_depth, input_width,
1050       input_height, filter_width, filter_height, depth_multiplier, stride,
1051       dilation_width_factor, dilation_height_factor, padding_type);
1052 }
1053 
TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,DepthwiseConvOutputRounding output_rounding)1054 void TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,
1055                           DepthwiseConvOutputRounding output_rounding) {
1056   TestParam test_param;
1057   test_param.forced_invocation = forced_invocation;
1058   test_param.output_rounding = output_rounding;
1059   while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
1060   }
1061 }
1062 
TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation forced_invocation,DepthwiseConvOutputRounding output_rounding)1063 void TestOneDepthwiseConv3x3Filter(
1064     DepthwiseConvImplementation forced_invocation,
1065     DepthwiseConvOutputRounding output_rounding) {
1066   TestParam test_param;
1067   test_param.forced_invocation = forced_invocation;
1068   test_param.output_rounding = output_rounding;
1069   while (!TryTestOneDepthwiseConv3x3Filter(test_param,
1070                                            ParamsSpecialization::kNone)) {
1071   }
1072 }
1073 
TestOneNeonDot3x3(const TestParam & test_param)1074 void TestOneNeonDot3x3(const TestParam& test_param) {
1075 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
1076     defined(__clang__)
1077   CpuFlags cpu_flags;
1078   GetCpuFlags(&cpu_flags);
1079   const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
1080   if (test_param.forced_invocation ==
1081           DepthwiseConvImplementation::kUseNeon3x3DotProduct &&
1082       !has_dot_product_instructions) {
1083     return;
1084   }
1085 #endif
1086 
1087   while (!TryTestOneNeonDot3x3(test_param, ParamsSpecialization::kSymmetric)) {
1088   }
1089 }
1090 
// Runs 1000 general randomized tests with forced_invocation set to kNone and
// away-from-zero output rounding.
TEST(TestDepthwiseConv, TestDepthwiseConv) {
  constexpr int kIterations = 1000;
  for (int remaining = kIterations; remaining > 0; --remaining) {
    TestOneDepthwiseConv(DepthwiseConvImplementation::kNone,
                         DepthwiseConvOutputRounding::kAwayFromZero);
  }
}
1098 
1099 // Run basic coverage test against the generic kernel.
TEST(TestDepthwiseConv,TestGenericKernel)1100 TEST(TestDepthwiseConv, TestGenericKernel) {
1101   const int kTestsToRun = 1000;
1102   for (int i = 0; i < kTestsToRun; i++) {
1103     TestOneDepthwiseConv(DepthwiseConvImplementation::kUseGenericKernel,
1104                          DepthwiseConvOutputRounding::kAwayFromZero);
1105   }
1106 }
1107 
1108 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
// Runs 500 randomized 3x3-filter tests with the NEON 3x3 implementation forced
// and away-from-zero output rounding.
TEST(TestDepthwiseConv, TestNeon3x3FilterAway) {
  constexpr int kIterations = 500;
  for (int remaining = kIterations; remaining > 0; --remaining) {
    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
                                  DepthwiseConvOutputRounding::kAwayFromZero);
  }
}
1116 
// Runs 500 randomized 3x3-filter tests with the NEON 3x3 implementation forced
// and upward output rounding.
TEST(TestDepthwiseConv, TestNeon3x3FilterUpward) {
  constexpr int kIterations = 500;
  for (int remaining = kIterations; remaining > 0; --remaining) {
    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
                                  DepthwiseConvOutputRounding::kUpward);
  }
}
1124 #endif
1125 
1126 // While 3x3 coverage tests are primarily targeted at specialized kernels, we
1127 // also run it against the generic kernel.
TEST(TestDepthwiseConv,TestGenericKernel3x3Filter)1128 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
1129   const int kTestsToRun = 100;
1130   for (int i = 0; i < kTestsToRun; i++) {
1131     TestOneDepthwiseConv3x3Filter(
1132         DepthwiseConvImplementation::kUseGenericKernel,
1133         DepthwiseConvOutputRounding::kAwayFromZero);
1134   }
1135 }
1136 
1137 class DepthwiseConvTest : public ::testing::TestWithParam<TestParamTuple> {};
1138 
// Runs the dot-product-style 3x3 test tests_to_run times with the parameters
// of the current instantiation.
TEST_P(DepthwiseConvTest, NeonDot3x3) {
  const TestParam test_param(GetParam());
  int remaining = test_param.tests_to_run;
  while (remaining-- > 0) {
    TestOneNeonDot3x3(test_param);
  }
}
1145 
1146 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
1147 INSTANTIATE_TEST_SUITE_P(
1148     Neon3x3KernelAway, DepthwiseConvTest,
1149     testing::Combine(
1150         Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
1151         Values(500),                                       // tests_to_run
1152         Values(QuantizationType::kNonPerChannelUint8),     // quantization_type
1153         Bool(),                                            // test_stride
1154         Values(false),                                     // test_pad
1155         Values(false),  // test_depth_multiplier
1156         Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
1157         Values(1),                                           // num_threads
1158         Values(false)                                        // loose_tolerance
1159         ),
1160     TestParam::TestNameSuffix);
1161 
1162 INSTANTIATE_TEST_SUITE_P(
1163     Neon3x3KernelUpward, DepthwiseConvTest,
1164     testing::Combine(
1165         Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
1166         Values(500),                                       // tests_to_run
1167         Values(QuantizationType::kNonPerChannelUint8),     // quantization_type
1168         Bool(),                                            // test_stride
1169         Values(false),                                     // test_pad
1170         Values(false),                                 // test_depth_multiplier
1171         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
1172         Values(1),                                     // num_threads
1173         Values(false)                                  // loose_tolerance
1174         ),
1175     TestParam::TestNameSuffix);
1176 
1177 INSTANTIATE_TEST_SUITE_P(
1178     Neon3x3KernelUpwardPerChannel, DepthwiseConvTest,
1179     testing::Combine(
1180         Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
1181         Values(500),                                       // tests_to_run
1182         Values(QuantizationType::kPerChannelInt8),         // quantization_type
1183         Bool(),                                            // test_stride
1184         Values(false),                                     // test_pad
1185         Values(false),                                 // test_depth_multiplier
1186         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
1187         Values(1),                                     // num_threads
1188         Values(false)                                  // loose_tolerance
1189         ),
1190     TestParam::TestNameSuffix);
1191 #endif  // __aarch64__ && !GOOGLE_L4T
1192 
1193 // While 3x3 coverage tests are primarily targeted at specialized kernels, we
1194 // also run it against the generic kernel.
1195 INSTANTIATE_TEST_SUITE_P(
1196     GenericKernel, DepthwiseConvTest,
1197     testing::Combine(
1198         Values(DepthwiseConvImplementation::
1199                    kUseGenericKernel),                  // forced_invocation
1200         Values(100),                                    // tests_to_run
1201         Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
1202         Bool(),                                         // test_stride
1203         Bool(),                                         // test_pad
1204         Bool(),                                         // test_depth_multiplier
1205         Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
1206         Values(1),                                           // num_threads
1207         Values(false)                                        // loose_tolerance
1208         ),
1209     TestParam::TestNameSuffix);
1210 
1211 INSTANTIATE_TEST_SUITE_P(
1212     GenericKernelPerChannel, DepthwiseConvTest,
1213     testing::Combine(
1214         Values(DepthwiseConvImplementation::
1215                    kUseGenericKernel),              // forced_invocation
1216         Values(100),                                // tests_to_run
1217         Values(QuantizationType::kPerChannelInt8),  // quantization_type
1218         Bool(),                                     // test_stride
1219         Bool(),                                     // test_pad
1220         Bool(),                                     // test_depth_multiplier
1221         Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
1222         Values(1),                                           // num_threads
1223         Values(false)                                        // loose_tolerance
1224         ),
1225     TestParam::TestNameSuffix);
1226 
1227 INSTANTIATE_TEST_SUITE_P(
1228     CModel, DepthwiseConvTest,
1229     testing::Combine(
1230         Values(DepthwiseConvImplementation::
1231                    kUseCModel3x3DotProduct),            // forced_invocation
1232         Values(200),                                    // tests_to_run
1233         Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
1234         Bool(),                                         // test_stride
1235         Bool(),                                         // test_pad
1236         Bool(),                                         // test_depth_multiplier
1237         Values(DepthwiseConvOutputRounding::kUpward),   // output_rounding
1238         Values(1),                                      // num_threads
1239         Values(false)                                   // loose_tolerance
1240         ),
1241     TestParam::TestNameSuffix);
1242 
1243 INSTANTIATE_TEST_SUITE_P(
1244     Unwound, DepthwiseConvTest,
1245     testing::Combine(
1246         Values(DepthwiseConvImplementation::
1247                    kUseUnwound3x3DotProduct),           // forced_invocation
1248         Values(200),                                    // tests_to_run
1249         Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
1250         Bool(),                                         // test_stride
1251         Bool(),                                         // test_pad
1252         Bool(),                                         // test_depth_multiplier
1253         Values(DepthwiseConvOutputRounding::kUpward),   // output_rounding
1254         Values(1),                                      // num_threads
1255         Values(false)                                   // loose_tolerance
1256         ),
1257     TestParam::TestNameSuffix);
1258 
1259 #if defined(USE_NEON)
1260 
1261 // TODO(b/148145875): Remove this extra guard after checking that code runs
1262 // without lax vector conversions.
1263 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
1264     defined(__clang__)
1265 INSTANTIATE_TEST_SUITE_P(
1266     IntrinsicsPerChannel, DepthwiseConvTest,
1267     testing::Combine(
1268         Values(DepthwiseConvImplementation::
1269                    kUseIntrinsics3x3DotProduct),       // forced_invocation
1270         Values(200),                                   // tests_to_run
1271         Values(QuantizationType::kPerChannelInt8),     // quantization_type
1272         Bool(),                                        // test_stride
1273         Bool(),                                        // test_pad
1274         Bool(),                                        // test_depth_multiplier
1275         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
1276         Values(1),                                     // num_threads
1277         Values(kLooseIntrinsicsTolerance)              // loose_tolerance
1278         ),
1279     TestParam::TestNameSuffix);
1280 #endif
1281 
1282 #endif
1283 
1284 #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \
1285     defined(__clang__)
1286 INSTANTIATE_TEST_SUITE_P(
1287     NeonAsm, DepthwiseConvTest,
1288     testing::Combine(
1289         Values(DepthwiseConvImplementation::
1290                    kUseNeon3x3DotProduct),              // forced_invocation
1291         Values(200),                                    // tests_to_run
1292         Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
1293         Bool(),                                         // test_stride
1294         Bool(),                                         // test_pad
1295         Bool(),                                         // test_depth_multiplier
1296         Values(DepthwiseConvOutputRounding::kUpward),   // output_rounding
1297         Values(1),                                      // num_threads
1298         Values(false)                                   // loose_tolerance
1299         ),
1300     TestParam::TestNameSuffix);
1301 
1302 INSTANTIATE_TEST_SUITE_P(
1303     NeonAsmPerChannel, DepthwiseConvTest,
1304     testing::Combine(
1305         Values(DepthwiseConvImplementation::
1306                    kUseNeon3x3DotProduct),             // forced_invocation
1307         Values(200),                                   // tests_to_run
1308         Values(QuantizationType::kPerChannelInt8),     // quantization_type
1309         Bool(),                                        // test_stride
1310         Bool(),                                        // test_pad
1311         Bool(),                                        // test_depth_multiplier
1312         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
1313         Values(1),                                     // num_threads
1314         Values(false)                                  // loose_tolerance
1315         ),
1316     TestParam::TestNameSuffix);
1317 
1318 // Apply the 3x3 tests through the dispatch.
1319 // Also test multi-threading. This assumes upward rounding.
1320 INSTANTIATE_TEST_SUITE_P(
1321     Dispatch3x3, DepthwiseConvTest,
1322     testing::Combine(
1323         Values(DepthwiseConvImplementation::kNone),     // forced_invocation
1324         Values(200),                                    // tests_to_run
1325         Values(QuantizationType::kNonPerChannelUint8),  // quantization_type
1326         Bool(),                                         // test_stride
1327         Bool(),                                         // test_pad
1328         Bool(),                                         // test_depth_multiplier
1329         Values(DepthwiseConvOutputRounding::kUpward),   // output_rounding
1330         Values(4),                                      // num_threads
1331         Values(false)                                   // loose_tolerance
1332         ),
1333     TestParam::TestNameSuffix);
1334 
1335 INSTANTIATE_TEST_SUITE_P(
1336     Dispatch3x3PerChannel, DepthwiseConvTest,
1337     testing::Combine(
1338         Values(DepthwiseConvImplementation::kNone),    // forced_invocation
1339         Values(200),                                   // tests_to_run
1340         Values(QuantizationType::kPerChannelInt8),     // quantization_type
1341         Bool(),                                        // test_stride
1342         Bool(),                                        // test_pad
1343         Bool(),                                        // test_depth_multiplier
1344         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
1345         Values(4),                                     // num_threads
1346         Values(false)                                  // loose_tolerance
1347         ),
1348     TestParam::TestNameSuffix);
1349 #endif
1350 
1351 }  // namespace
1352 }  // namespace tflite
1353