1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <sys/types.h>
16
17 #include <algorithm>
18 #include <cmath>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 #include <type_traits>
25 #include <vector>
26
27 #include <gtest/gtest.h>
28 #include "tensorflow/lite/kernels/internal/common.h"
29 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
30 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
31 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h"
32 #include "tensorflow/lite/kernels/internal/quantization_util.h"
33 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
34 #include "tensorflow/lite/kernels/internal/test_util.h"
35 #include "tensorflow/lite/kernels/internal/types.h"
36
37 namespace tflite {
38 namespace {
39
PickOutputMultiplier(const DepthwiseParams & params,const RuntimeShape & input_shape,const int8 * input_data,const RuntimeShape & filter_shape,const int8 * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,float * output_multiplier)40 void PickOutputMultiplier(
41 const DepthwiseParams& params, const RuntimeShape& input_shape,
42 const int8* input_data, const RuntimeShape& filter_shape,
43 const int8* filter_data, const RuntimeShape& bias_shape,
44 const int32* bias_data, const RuntimeShape& output_shape,
45 float* output_multiplier) {
46 const int stride_width = params.stride_width;
47 const int stride_height = params.stride_height;
48 const int dilation_width_factor = params.dilation_width_factor;
49 const int dilation_height_factor = params.dilation_height_factor;
50 const int pad_width = params.padding_values.width;
51 const int pad_height = params.padding_values.height;
52 const int depth_multiplier = params.depth_multiplier;
53 const int32 input_offset = params.input_offset;
54
55 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
56 const int input_height = input_shape.Dims(1);
57 const int input_width = input_shape.Dims(2);
58 const int input_depth = input_shape.Dims(3);
59 const int filter_height = filter_shape.Dims(1);
60 const int filter_width = filter_shape.Dims(2);
61 const int output_height = output_shape.Dims(1);
62 const int output_width = output_shape.Dims(2);
63
64 int output_accu_min = std::numeric_limits<std::int32_t>::max();
65 int output_accu_max = std::numeric_limits<std::int32_t>::min();
66
67 for (int batch = 0; batch < batches; ++batch) {
68 for (int out_y = 0; out_y < output_height; ++out_y) {
69 for (int out_x = 0; out_x < output_width; ++out_x) {
70 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
71 for (int m = 0; m < depth_multiplier; ++m) {
72 const int output_channel = m + in_channel * depth_multiplier;
73 const int in_x_origin = (out_x * stride_width) - pad_width;
74 const int in_y_origin = (out_y * stride_height) - pad_height;
75 int32 acc = 0;
76 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
77 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
78 const int in_x = in_x_origin + dilation_width_factor * filter_x;
79 const int in_y =
80 in_y_origin + dilation_height_factor * filter_y;
81 // Zero padding by omitting the areas outside the image.
82 const bool is_point_inside_image =
83 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
84 (in_y < input_height);
85 if (is_point_inside_image) {
86 int32 input_val = input_data[Offset(input_shape, batch, in_y,
87 in_x, in_channel)];
88 int32 filter_val = filter_data[Offset(
89 filter_shape, 0, filter_y, filter_x, output_channel)];
90 acc += filter_val * (input_val + input_offset);
91 }
92 }
93 }
94 if (bias_data) {
95 acc += bias_data[output_channel];
96 }
97 output_accu_max = std::max(acc, output_accu_max);
98 output_accu_min = std::min(acc, output_accu_min);
99 }
100 }
101 }
102 }
103 }
104
105 // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
106 // min/max fit in those ranges correspondingly as much as possible.
107 if (std::abs(output_accu_max) > std::abs(output_accu_min)) {
108 *output_multiplier = 127.0f / std::abs(output_accu_max);
109 } else {
110 *output_multiplier = 128.0f / std::abs(output_accu_min);
111 }
112 }
113
PickReasonableMultiplier(const DepthwiseParams & params,int output_activation_min,int output_activation_max,int output_depth,const RuntimeShape & input_shape_inference,const std::int8_t * input_data,const RuntimeShape & filter_shape_inference,const std::int8_t * filter_data,const RuntimeShape & bias_shape_inference,const std::int32_t * bias_data,const RuntimeShape & output_shape_inference,std::int32_t * output_multiplier_ptr,std::int32_t * output_shift_ptr,std::int8_t * output_data)114 void PickReasonableMultiplier(
115 const DepthwiseParams& params, int output_activation_min,
116 int output_activation_max, int output_depth,
117 const RuntimeShape& input_shape_inference, const std::int8_t* input_data,
118 const RuntimeShape& filter_shape_inference, const std::int8_t* filter_data,
119 const RuntimeShape& bias_shape_inference, const std::int32_t* bias_data,
120 const RuntimeShape& output_shape_inference,
121 std::int32_t* output_multiplier_ptr, std::int32_t* output_shift_ptr,
122 std::int8_t* output_data) {
123 float output_multiplier;
124 PickOutputMultiplier(params, input_shape_inference, input_data,
125 filter_shape_inference, filter_data,
126 bias_shape_inference, bias_data, output_shape_inference,
127 &output_multiplier);
128
129 int base_multiplier;
130 int base_shift;
131 QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
132 for (int i = 0; i < output_depth; ++i) {
133 // multipliers typically range in [2^30 ; 2^31 - 1].
134 // Values in [0, 2^30 - 1] are normally unused, but harmless.
135 // Thus a good way to randomize multipliers is to subtract from them
136 // a random value smaller than 2^30 but still significant compared to it.
137 output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
138 output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
139 }
140 }
141
142 // The reference implementation & the fast kernel have different rounding
143 // mechanism, so we loosely compare the difference.
CompareRoundingResults(int flat_size,const int depth_multiplier,const std::int8_t * reference_result,const std::int8_t * fast_kernel_result)144 void CompareRoundingResults(int flat_size, const int depth_multiplier,
145 const std::int8_t* reference_result,
146 const std::int8_t* fast_kernel_result) {
147 std::vector<int> diff(flat_size);
148 std::int64_t sum_diff = 0;
149 std::int64_t sum_abs_diff = 0;
150 for (int i = 0; i < flat_size; i++) {
151 diff[i] = static_cast<int>(fast_kernel_result[i]) -
152 static_cast<int>(reference_result[i]);
153 sum_diff += diff[i];
154 sum_abs_diff += std::abs(diff[i]);
155 }
156 // These stats help understand test failures.
157 std::sort(std::begin(diff), std::end(diff));
158 const int min_diff = diff.front();
159 const int max_diff = diff.back();
160 const int median_diff = diff[diff.size() / 2];
161 const float mean_diff = static_cast<float>(sum_diff) / flat_size;
162 const float mean_abs_diff = static_cast<float>(sum_abs_diff) / flat_size;
163
164 // The tolerance that we apply to means is tight, but we allow for a rounding
165 // difference in one pixel, and loosen by another 1% for float comparison.
166 const float mean_tolerance =
167 std::max(1e-2f, 1.01f / flat_size * std::sqrt(1.f * depth_multiplier));
168 const int diff_mean_tolerance = 256;
169 const int diff_median_tolerance = 225;
170
171 // Normally we should require bit-for-bit exact results. Unfortunately a bug
172 // in the Intel arm_neon_sse.h translation header that we use for x86 tests
173 // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
174 // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
175 // few off-by-one errors for now, yet still ensure that no more than a small
176 // minority of values are wrong.
177 EXPECT_LT(std::abs(mean_diff), mean_tolerance);
178 EXPECT_LT(mean_abs_diff, mean_tolerance);
179 EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
180 EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
181 EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
182 EXPECT_TRUE(std::abs(mean_diff) < mean_tolerance &&
183 mean_abs_diff < mean_tolerance &&
184 std::abs(median_diff) <= diff_median_tolerance &&
185 std::abs(min_diff) <= diff_mean_tolerance &&
186 std::abs(max_diff) <= diff_mean_tolerance);
187 }
188
GenerateValidShapeConfigurations(int filter_width,int filter_height,int depth_multiplier,int dilation_width_factor,int dilation_height_factor,RuntimeShape * input_shape_inference,RuntimeShape * filter_shape_inference,RuntimeShape * output_shape_inference,int * pad_width,int * pad_height,int * stride)189 bool GenerateValidShapeConfigurations(
190 int filter_width, int filter_height, int depth_multiplier,
191 int dilation_width_factor, int dilation_height_factor,
192 RuntimeShape* input_shape_inference, RuntimeShape* filter_shape_inference,
193 RuntimeShape* output_shape_inference, int* pad_width, int* pad_height,
194 int* stride) {
195 const int batch = UniformRandomInt(1, 3);
196 const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
197 const int input_width = UniformRandomInt(5, 50);
198 const int input_height = UniformRandomInt(5, 50);
199 *stride = UniformRandomInt(1, 2);
200 const bool test_pad = UniformRandomInt(0, 1);
201 const auto padding_type = test_pad ? PaddingType::kValid : PaddingType::kSame;
202
203 const int output_depth = input_depth * depth_multiplier;
204
205 input_shape_inference->BuildFrom(
206 {batch, input_height, input_width, input_depth});
207
208 filter_shape_inference->BuildFrom(
209 {1, filter_height, filter_width, output_depth});
210
211 EXPECT_TRUE(ComputeConvSizes(
212 *input_shape_inference, output_depth, filter_width, filter_height,
213 *stride, dilation_width_factor, dilation_height_factor, padding_type,
214 output_shape_inference, pad_width, pad_height));
215
216 // We just care about whether the shape is suitable so we use non-per-channel
217 // case.
218 return optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
219 optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
220 *input_shape_inference, *filter_shape_inference, *stride, *stride,
221 dilation_width_factor, dilation_height_factor, *pad_width, *pad_height,
222 depth_multiplier, *output_shape_inference, 0);
223 }
224
// One randomized end-to-end trial: draws a 3x3-filter depthwise-conv
// configuration accepted by the fast kernel, then runs the reference impl,
// the Neon general impl, and (on aarch64) the fast 3x3 per-channel kernel,
// checking that their outputs agree within the rounding tolerances.
void TryTestOneDepthwiseConv3x3Filter() {
  const int filter_width = 3;
  const int filter_height = 3;
  const int depth_multiplier = 1;
  // We don't support dilations in the 3x3 filter.
  const int dilation_width_factor = 1;
  const int dilation_height_factor = 1;

  // Full int8 range: no activation clamping beyond the type limits.
  const int output_activation_min = -128;
  const int output_activation_max = 127;

  // Small random zero points for input/output quantization.
  const std::int32_t input_offset = UniformRandomInt(-25, 25);
  const std::int32_t output_offset = UniformRandomInt(-25, 25);

  RuntimeShape input_shape_inference;
  RuntimeShape filter_shape_inference;
  RuntimeShape output_shape_inference;
  int pad_width, pad_height;
  int stride;

  // Keeps trying until we get valid shape/configurations for 3x3 filter case.
  bool generated_valid_configurations_for_3x3_kernel = false;
  while (!generated_valid_configurations_for_3x3_kernel) {
    generated_valid_configurations_for_3x3_kernel =
        GenerateValidShapeConfigurations(
            filter_width, filter_height, depth_multiplier,
            dilation_width_factor, dilation_height_factor,
            &input_shape_inference, &filter_shape_inference,
            &output_shape_inference, &pad_width, &pad_height, &stride);
  }

  const int output_depth = output_shape_inference.Dims(3);

  // One bias value per output channel.
  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
  const int input_buffer_size = input_shape_inference.FlatSize();
  const int filter_buffer_size = filter_shape_inference.FlatSize();
  const int output_buffer_size = output_shape_inference.FlatSize();
  std::vector<std::int8_t> input_data(input_buffer_size);
  std::vector<std::int8_t> filter_data(filter_buffer_size);
  std::vector<std::int32_t> bias_data(output_depth);

  FillRandom(&input_data);
  FillRandom(&filter_data);
  FillRandom(&bias_data, -1000, 1000);

  DepthwiseParams params;
  params.stride_width = stride;
  params.stride_height = stride;
  params.dilation_height_factor = dilation_height_factor;
  params.dilation_width_factor = dilation_width_factor;
  params.padding_values.width = pad_width;
  params.padding_values.height = pad_height;
  params.depth_multiplier = depth_multiplier;
  params.input_offset = input_offset;
  params.output_offset = output_offset;
  // Per-channel int8 quantization uses a zero weight offset.
  params.weights_offset = 0;
  params.quantized_activation_min = output_activation_min;
  params.quantized_activation_max = output_activation_max;

  std::vector<std::int8_t> reference_output_data(output_buffer_size);
  std::vector<std::int8_t> neon_output_data(output_buffer_size);

  std::vector<std::int32_t> output_multiplier(output_depth);
  std::vector<std::int32_t> output_shift(output_depth);

  // It's hard to come up with a right multiplier, random guess basically makes
  // all the results saturated and becomes meaningless, so we first use
  // reference impl to poke the min/max value of the accumulation, then use
  // that value as a guided suggestion for us to populate meaningful multiplier
  // & shift.
  PickReasonableMultiplier(
      params, output_activation_min, output_activation_max, output_depth,
      input_shape_inference, input_data.data(), filter_shape_inference,
      filter_data.data(), bias_shape_inference, bias_data.data(),
      output_shape_inference, output_multiplier.data(), output_shift.data(),
      reference_output_data.data());

  // The per-channel variant of the fast kernel must also accept this
  // configuration, including the randomized per-channel shifts.
  EXPECT_TRUE(optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
      optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
      input_shape_inference, filter_shape_inference, stride, stride,
      dilation_width_factor, dilation_height_factor, pad_width, pad_height,
      depth_multiplier, output_shape_inference, 0, output_shift.data()));

  // The following tests compare reference impl and Neon general impl agrees,
  // and reference impl loosely agrees with fast kernel since they use
  // different rounding strategy.
  reference_integer_ops::DepthwiseConvPerChannel(
      params, output_multiplier.data(), output_shift.data(),
      input_shape_inference, input_data.data(), filter_shape_inference,
      filter_data.data(), bias_shape_inference, bias_data.data(),
      output_shape_inference, reference_output_data.data());

  // Single-threaded invocation: one slice covering every output row.
  optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
      params, output_multiplier.data(), output_shift.data(),
      input_shape_inference, input_data.data(), filter_shape_inference,
      filter_data.data(), bias_shape_inference, bias_data.data(),
      output_shape_inference, neon_output_data.data(),
      /*thread_start=*/0,
      /*thread_end=*/output_shape_inference.Dims(1), /*thread_dim=*/1);

  // We have changed our rounding strategy to the ARM rounding-right-shift
  // instruction: breaking tie upward as it's much simpler.
  // So we allow some difference for the neon output VS. the reference output.
  CompareRoundingResults(output_buffer_size, depth_multiplier,
                         reference_output_data.data(), neon_output_data.data());

  // The hand-written fast 3x3 kernel is only built for aarch64.
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
  std::vector<std::int8_t> fast_kernel_output_data(output_buffer_size);
  optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
      DepthwiseConvOutputRounding::kUpward>(
      params, output_multiplier.data(), output_shift.data(),
      input_shape_inference, input_data.data(), filter_shape_inference,
      filter_data.data(), bias_shape_inference, bias_data.data(),
      output_shape_inference, fast_kernel_output_data.data(),
      /*thread_start=*/0,
      /*thread_end=*/output_shape_inference.Dims(1), /*thread_dim=*/1);

  CompareRoundingResults(output_buffer_size, depth_multiplier,
                         reference_output_data.data(),
                         fast_kernel_output_data.data());
#endif
}
347
// Runs many randomized trials so the configuration generator explores a
// variety of batch sizes, spatial dims, depths, strides, and padding modes.
TEST(QuantizedDepthwiseConvPerChannelTest, FastKernelTest) {
  for (int i = 0; i < 60; ++i) {
    TryTestOneDepthwiseConv3x3Filter();
  }
}
353
354 } // namespace
355 } // namespace tflite
356