• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <sys/types.h>
16 
17 #include <algorithm>
18 #include <cmath>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 #include <type_traits>
25 #include <vector>
26 
27 #include <gtest/gtest.h>
28 #include "tensorflow/lite/kernels/internal/common.h"
29 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
30 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
31 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h"
32 #include "tensorflow/lite/kernels/internal/quantization_util.h"
33 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
34 #include "tensorflow/lite/kernels/internal/test_util.h"
35 #include "tensorflow/lite/kernels/internal/types.h"
36 
37 namespace tflite {
38 namespace {
39 
PickOutputMultiplier(const DepthwiseParams & params,const RuntimeShape & input_shape,const int8 * input_data,const RuntimeShape & filter_shape,const int8 * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,float * output_multiplier)40 void PickOutputMultiplier(
41     const DepthwiseParams& params, const RuntimeShape& input_shape,
42     const int8* input_data, const RuntimeShape& filter_shape,
43     const int8* filter_data, const RuntimeShape& bias_shape,
44     const int32* bias_data, const RuntimeShape& output_shape,
45     float* output_multiplier) {
46   const int stride_width = params.stride_width;
47   const int stride_height = params.stride_height;
48   const int dilation_width_factor = params.dilation_width_factor;
49   const int dilation_height_factor = params.dilation_height_factor;
50   const int pad_width = params.padding_values.width;
51   const int pad_height = params.padding_values.height;
52   const int depth_multiplier = params.depth_multiplier;
53   const int32 input_offset = params.input_offset;
54 
55   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
56   const int input_height = input_shape.Dims(1);
57   const int input_width = input_shape.Dims(2);
58   const int input_depth = input_shape.Dims(3);
59   const int filter_height = filter_shape.Dims(1);
60   const int filter_width = filter_shape.Dims(2);
61   const int output_height = output_shape.Dims(1);
62   const int output_width = output_shape.Dims(2);
63 
64   int output_accu_min = std::numeric_limits<std::int32_t>::max();
65   int output_accu_max = std::numeric_limits<std::int32_t>::min();
66 
67   for (int batch = 0; batch < batches; ++batch) {
68     for (int out_y = 0; out_y < output_height; ++out_y) {
69       for (int out_x = 0; out_x < output_width; ++out_x) {
70         for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
71           for (int m = 0; m < depth_multiplier; ++m) {
72             const int output_channel = m + in_channel * depth_multiplier;
73             const int in_x_origin = (out_x * stride_width) - pad_width;
74             const int in_y_origin = (out_y * stride_height) - pad_height;
75             int32 acc = 0;
76             for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
77               for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
78                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
79                 const int in_y =
80                     in_y_origin + dilation_height_factor * filter_y;
81                 // Zero padding by omitting the areas outside the image.
82                 const bool is_point_inside_image =
83                     (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
84                     (in_y < input_height);
85                 if (is_point_inside_image) {
86                   int32 input_val = input_data[Offset(input_shape, batch, in_y,
87                                                       in_x, in_channel)];
88                   int32 filter_val = filter_data[Offset(
89                       filter_shape, 0, filter_y, filter_x, output_channel)];
90                   acc += filter_val * (input_val + input_offset);
91                 }
92               }
93             }
94             if (bias_data) {
95               acc += bias_data[output_channel];
96             }
97             output_accu_max = std::max(acc, output_accu_max);
98             output_accu_min = std::min(acc, output_accu_min);
99           }
100         }
101       }
102     }
103   }
104 
105   // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
106   // min/max fit in those ranges correspondingly as much as possible.
107   if (std::abs(output_accu_max) > std::abs(output_accu_min)) {
108     *output_multiplier = 127.0f / std::abs(output_accu_max);
109   } else {
110     *output_multiplier = 128.0f / std::abs(output_accu_min);
111   }
112 }
113 
PickReasonableMultiplier(const DepthwiseParams & params,int output_activation_min,int output_activation_max,int output_depth,const RuntimeShape & input_shape_inference,const std::int8_t * input_data,const RuntimeShape & filter_shape_inference,const std::int8_t * filter_data,const RuntimeShape & bias_shape_inference,const std::int32_t * bias_data,const RuntimeShape & output_shape_inference,std::int32_t * output_multiplier_ptr,std::int32_t * output_shift_ptr,std::int8_t * output_data)114 void PickReasonableMultiplier(
115     const DepthwiseParams& params, int output_activation_min,
116     int output_activation_max, int output_depth,
117     const RuntimeShape& input_shape_inference, const std::int8_t* input_data,
118     const RuntimeShape& filter_shape_inference, const std::int8_t* filter_data,
119     const RuntimeShape& bias_shape_inference, const std::int32_t* bias_data,
120     const RuntimeShape& output_shape_inference,
121     std::int32_t* output_multiplier_ptr, std::int32_t* output_shift_ptr,
122     std::int8_t* output_data) {
123   float output_multiplier;
124   PickOutputMultiplier(params, input_shape_inference, input_data,
125                        filter_shape_inference, filter_data,
126                        bias_shape_inference, bias_data, output_shape_inference,
127                        &output_multiplier);
128 
129   int base_multiplier;
130   int base_shift;
131   QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
132   for (int i = 0; i < output_depth; ++i) {
133     // multipliers typically range in [2^30 ; 2^31 - 1].
134     // Values in [0, 2^30 - 1] are normally unused, but harmless.
135     // Thus a good way to randomize multipliers is to subtract from them
136     // a random value smaller than 2^30 but still significant compared to it.
137     output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
138     output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
139   }
140 }
141 
142 // The reference implementation & the fast kernel have different rounding
143 // mechanism, so we loosely compare the difference.
CompareRoundingResults(int flat_size,const int depth_multiplier,const std::int8_t * reference_result,const std::int8_t * fast_kernel_result)144 void CompareRoundingResults(int flat_size, const int depth_multiplier,
145                             const std::int8_t* reference_result,
146                             const std::int8_t* fast_kernel_result) {
147   std::vector<int> diff(flat_size);
148   std::int64_t sum_diff = 0;
149   std::int64_t sum_abs_diff = 0;
150   for (int i = 0; i < flat_size; i++) {
151     diff[i] = static_cast<int>(fast_kernel_result[i]) -
152               static_cast<int>(reference_result[i]);
153     sum_diff += diff[i];
154     sum_abs_diff += std::abs(diff[i]);
155   }
156   // These stats help understand test failures.
157   std::sort(std::begin(diff), std::end(diff));
158   const int min_diff = diff.front();
159   const int max_diff = diff.back();
160   const int median_diff = diff[diff.size() / 2];
161   const float mean_diff = static_cast<float>(sum_diff) / flat_size;
162   const float mean_abs_diff = static_cast<float>(sum_abs_diff) / flat_size;
163 
164   // The tolerance that we apply to means is tight, but we allow for a rounding
165   // difference in one pixel, and loosen by another 1% for float comparison.
166   const float mean_tolerance =
167       std::max(1e-2f, 1.01f / flat_size * std::sqrt(1.f * depth_multiplier));
168   const int diff_mean_tolerance = 256;
169   const int diff_median_tolerance = 225;
170 
171   // Normally we should require bit-for-bit exact results. Unfortunately a bug
172   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
173   // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
174   // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
175   // few off-by-one errors for now, yet still ensure that no more than a small
176   // minority of values are wrong.
177   EXPECT_LT(std::abs(mean_diff), mean_tolerance);
178   EXPECT_LT(mean_abs_diff, mean_tolerance);
179   EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
180   EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
181   EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
182   EXPECT_TRUE(std::abs(mean_diff) < mean_tolerance &&
183               mean_abs_diff < mean_tolerance &&
184               std::abs(median_diff) <= diff_median_tolerance &&
185               std::abs(min_diff) <= diff_mean_tolerance &&
186               std::abs(max_diff) <= diff_mean_tolerance);
187 }
188 
GenerateValidShapeConfigurations(int filter_width,int filter_height,int depth_multiplier,int dilation_width_factor,int dilation_height_factor,RuntimeShape * input_shape_inference,RuntimeShape * filter_shape_inference,RuntimeShape * output_shape_inference,int * pad_width,int * pad_height,int * stride)189 bool GenerateValidShapeConfigurations(
190     int filter_width, int filter_height, int depth_multiplier,
191     int dilation_width_factor, int dilation_height_factor,
192     RuntimeShape* input_shape_inference, RuntimeShape* filter_shape_inference,
193     RuntimeShape* output_shape_inference, int* pad_width, int* pad_height,
194     int* stride) {
195   const int batch = UniformRandomInt(1, 3);
196   const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
197   const int input_width = UniformRandomInt(5, 50);
198   const int input_height = UniformRandomInt(5, 50);
199   *stride = UniformRandomInt(1, 2);
200   const bool test_pad = UniformRandomInt(0, 1);
201   const auto padding_type = test_pad ? PaddingType::kValid : PaddingType::kSame;
202 
203   const int output_depth = input_depth * depth_multiplier;
204 
205   input_shape_inference->BuildFrom(
206       {batch, input_height, input_width, input_depth});
207 
208   filter_shape_inference->BuildFrom(
209       {1, filter_height, filter_width, output_depth});
210 
211   EXPECT_TRUE(ComputeConvSizes(
212       *input_shape_inference, output_depth, filter_width, filter_height,
213       *stride, dilation_width_factor, dilation_height_factor, padding_type,
214       output_shape_inference, pad_width, pad_height));
215 
216   // We just care about whether the shape is suitable so we use non-per-channel
217   // case.
218   return optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
219       optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
220       *input_shape_inference, *filter_shape_inference, *stride, *stride,
221       dilation_width_factor, dilation_height_factor, *pad_width, *pad_height,
222       depth_multiplier, *output_shape_inference, 0);
223 }
224 
TryTestOneDepthwiseConv3x3Filter()225 void TryTestOneDepthwiseConv3x3Filter() {
226   const int filter_width = 3;
227   const int filter_height = 3;
228   const int depth_multiplier = 1;
229   // We don't support dilations in the 3x3 filter.
230   const int dilation_width_factor = 1;
231   const int dilation_height_factor = 1;
232 
233   const int output_activation_min = -128;
234   const int output_activation_max = 127;
235 
236   const std::int32_t input_offset = UniformRandomInt(-25, 25);
237   const std::int32_t output_offset = UniformRandomInt(-25, 25);
238 
239   RuntimeShape input_shape_inference;
240   RuntimeShape filter_shape_inference;
241   RuntimeShape output_shape_inference;
242   int pad_width, pad_height;
243   int stride;
244 
245   // Keeps trying until we get valid shape/configurations for 3x3 filter case.
246   bool generated_valid_configurations_for_3x3_kernel = false;
247   while (!generated_valid_configurations_for_3x3_kernel) {
248     generated_valid_configurations_for_3x3_kernel =
249         GenerateValidShapeConfigurations(
250             filter_width, filter_height, depth_multiplier,
251             dilation_width_factor, dilation_height_factor,
252             &input_shape_inference, &filter_shape_inference,
253             &output_shape_inference, &pad_width, &pad_height, &stride);
254   }
255 
256   const int output_depth = output_shape_inference.Dims(3);
257 
258   RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
259   const int input_buffer_size = input_shape_inference.FlatSize();
260   const int filter_buffer_size = filter_shape_inference.FlatSize();
261   const int output_buffer_size = output_shape_inference.FlatSize();
262   std::vector<std::int8_t> input_data(input_buffer_size);
263   std::vector<std::int8_t> filter_data(filter_buffer_size);
264   std::vector<std::int32_t> bias_data(output_depth);
265 
266   FillRandom(&input_data);
267   FillRandom(&filter_data);
268   FillRandom(&bias_data, -1000, 1000);
269 
270   DepthwiseParams params;
271   params.stride_width = stride;
272   params.stride_height = stride;
273   params.dilation_height_factor = dilation_height_factor;
274   params.dilation_width_factor = dilation_width_factor;
275   params.padding_values.width = pad_width;
276   params.padding_values.height = pad_height;
277   params.depth_multiplier = depth_multiplier;
278   params.input_offset = input_offset;
279   params.output_offset = output_offset;
280   params.weights_offset = 0;
281   params.quantized_activation_min = output_activation_min;
282   params.quantized_activation_max = output_activation_max;
283 
284   std::vector<std::int8_t> reference_output_data(output_buffer_size);
285   std::vector<std::int8_t> neon_output_data(output_buffer_size);
286 
287   std::vector<std::int32_t> output_multiplier(output_depth);
288   std::vector<std::int32_t> output_shift(output_depth);
289 
290   // It's hard to come up with a right multiplier, random guess basically makes
291   // all the results saturated and becomes meaningfulless, so we first use
292   // reference impl to poke the min/max value of the accumulation, then use that
293   // value as a guided suggestion for us to populate meaningful multiplier &
294   // shift.
295   PickReasonableMultiplier(
296       params, output_activation_min, output_activation_max, output_depth,
297       input_shape_inference, input_data.data(), filter_shape_inference,
298       filter_data.data(), bias_shape_inference, bias_data.data(),
299       output_shape_inference, output_multiplier.data(), output_shift.data(),
300       reference_output_data.data());
301 
302   EXPECT_TRUE(optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
303               optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
304       input_shape_inference, filter_shape_inference, stride, stride,
305       dilation_width_factor, dilation_height_factor, pad_width, pad_height,
306       depth_multiplier, output_shape_inference, 0, output_shift.data()));
307 
308   // The following tests compare reference impl and Neon general impl agrees,
309   // and reference impl loosely agrees with fast kernel since they use different
310   // rounding strategy.
311   reference_integer_ops::DepthwiseConvPerChannel(
312       params, output_multiplier.data(), output_shift.data(),
313       input_shape_inference, input_data.data(), filter_shape_inference,
314       filter_data.data(), bias_shape_inference, bias_data.data(),
315       output_shape_inference, reference_output_data.data());
316 
317   optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
318       params, output_multiplier.data(), output_shift.data(),
319       input_shape_inference, input_data.data(), filter_shape_inference,
320       filter_data.data(), bias_shape_inference, bias_data.data(),
321       output_shape_inference, neon_output_data.data(),
322       /*thread_start=*/0,
323       /*thread_end=*/output_shape_inference.Dims(1), /*thread_dim=*/1);
324 
325   // We have changed our rounding strategy to the ARM rounding-right-shift
326   // instruction: breaking tie upward as it's much simpler.
327   // So we allow some difference for the neon output VS. the reference output.
328   CompareRoundingResults(output_buffer_size, depth_multiplier,
329                          reference_output_data.data(), neon_output_data.data());
330 
331 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
332   std::vector<std::int8_t> fast_kernel_output_data(output_buffer_size);
333   optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
334       DepthwiseConvOutputRounding::kUpward>(
335       params, output_multiplier.data(), output_shift.data(),
336       input_shape_inference, input_data.data(), filter_shape_inference,
337       filter_data.data(), bias_shape_inference, bias_data.data(),
338       output_shape_inference, fast_kernel_output_data.data(),
339       /*thread_start=*/0,
340       /*thread_end=*/output_shape_inference.Dims(1), /*thread_dim=*/1);
341 
342   CompareRoundingResults(output_buffer_size, depth_multiplier,
343                          reference_output_data.data(),
344                          fast_kernel_output_data.data());
345 #endif
346 }
347 
// Run a batch of independent randomized trials; each trial draws its own
// shapes, data, and quantization parameters.
TEST(QuantizedDepthwiseConvPerChannelTest, FastKernelTest) {
  constexpr int kNumTrials = 60;
  for (int trial = 0; trial < kNumTrials; ++trial) {
    TryTestOneDepthwiseConv3x3Filter();
  }
}
353 
354 }  // namespace
355 }  // namespace tflite
356