1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
17
18 #include <algorithm>
19
20 #include "fixedpoint/fixedpoint.h"
21 #include "tensorflow/lite/kernels/internal/common.h"
22 #include "tensorflow/lite/kernels/internal/compatibility.h"
23 #include "tensorflow/lite/kernels/internal/types.h"
24
25 namespace tflite {
26
27 // Used in tests and template parameters to control which version of depthwise
28 // convolution is called. Primarily for reference code, and specializations
29 // forced in tests.
// Used in tests and template parameters to control which version of depthwise
// convolution is called. Primarily for reference code, and specializations
// forced in tests.
enum class DepthwiseConvImplementation {
  // Run all tests against kUseStandardEntry even if also testing another
  // kernel, since we need to be sure that the main DepthwiseConv() function in
  // optimized_ops.h dispatches to a correctly-executing kernel.
  kNone = 0,               // The "default" option: use the normal
                           // DepthwiseConv kernel (entry) function.
  kUseGenericKernel,       // Forced use of generic kernel.
  kUseNeon3x3,             // 3x3 kernel that uses NEON when available.
  kUseNeon3x3DotProduct,   // 3x3 kernel that uses dot-product enabled NEON
                           // when available.
  kUseCModel3x3DotProduct, // 3x3 kernel, reference C model that is intended
                           // to match overall design NEON code.
  kUseUnwound3x3DotProduct, // 3x3 kernel, reference C model with unwound loops
                            // and some arrays.
  kUseIntrinsics3x3DotProduct, // 3x3 kernel using NEON intrinsics.
};
46
// Category of depthwise convolution output rounding.
//
// Selects how the 32-bit accumulator is rounded when rescaled to the output
// scale; see the DepthwiseConvRound specializations below.
enum class DepthwiseConvOutputRounding {
  kNone = 0,      // Invalid: specific method must be specified.
  kAwayFromZero,  // Original method: exact halves rounded away from zero.
  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
  // This is where a future kNearestEven would be placed.
};
54
// Category of depthwise convolution depth multiplication.
//
// Distinguishes the two shapes the specialized kernels optimize for; the
// generic reference code below handles both uniformly.
enum class DepthwiseConvDepthMultiplication {
  kNoMultiplication = 0,  // Depth multiplier = 1.
  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
};
60
61 namespace reference_ops {
62 namespace depthwise_conv {
63
64 template <DepthwiseConvOutputRounding output_rounding>
DepthwiseConvRound(int32_t x,int32_t quantized_multiplier,int shift)65 inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
66 int shift) {
67 TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
68 return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
69 }
70
71 template <>
72 inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
73 int32_t x, int32_t quantized_multiplier, int shift) {
74 return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
75 }
76
// kUpward: rounds exact halves towards +infinity by adding half of the final
// divisor before the arithmetic right shift.
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  // A positive shift is applied (as a left shift) before the fixed-point
  // multiply; a negative shift becomes a right shift applied after it.
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  // Adding 2^(right_shift - 1) before shifting right implements
  // round-half-up (towards +infinity).
  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
  // NOTE(review): x * (1 << left_shift) can overflow int32_t when
  // left_shift > 0 and |x| is large — presumably callers keep the product in
  // range; confirm against MultiplyByQuantizedMultiplier's contract.
  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                            quantized_multiplier) +
          rounding_offset) >>
         right_shift;
}
89
// Reference depthwise-convolution kernel, parameterized on the output
// rounding mode. Run() implements uint8 per-tensor quantization;
// RunPerChannel() implements int8 per-channel quantization.
template <DepthwiseConvOutputRounding output_rounding>
struct DepthwiseConvBasicKernel {
  // uint8 per-tensor-quantized depthwise convolution.
  //
  // Shapes are 4-D; input/output are indexed as (batch, y, x, channel) and
  // the filter as (0, filter_y, filter_x, output_channel). Requires
  // output_depth == input_depth * depth_multiplier and, if bias_data is
  // non-null, bias_shape.FlatSize() == output_depth. Offsets
  // (input/filter/output) and the multiplier/shift come from params; the
  // accumulator is rescaled via DepthwiseConvRound<output_rounding> and
  // clamped to [quantized_activation_min, quantized_activation_max].
  static inline void Run(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const uint8_t* input_data, const RuntimeShape& filter_shape,
      const uint8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      uint8_t* output_data) {
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    const int32_t input_offset = params.input_offset;
    const int32_t filter_offset = params.weights_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_multiplier = params.output_multiplier;
    const int output_shift = params.output_shift;
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int b = 0; b < batches; ++b) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int ic = 0; ic < input_depth; ++ic) {
            for (int m = 0; m < depth_multiplier; m++) {
              // Each input channel ic feeds depth_multiplier output
              // channels; oc is the flattened output channel index.
              const int oc = m + ic * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // If the location is outside the bounds of the input image,
                  // use zero as a default value.
                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height)) {
                    int32_t input_val =
                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, oc)];
                    // Both operands are offset back to their real (affine
                    // dequantized, pre-scale) values before accumulating.
                    acc += (filter_val + filter_offset) *
                           (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[oc];
              }
              // Rescale to the output scale with the selected rounding mode,
              // re-apply the output zero point, then clamp to the activation
              // range before narrowing to uint8.
              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
                                                        output_shift);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
                  static_cast<uint8_t>(acc);
            }
          }
        }
      }
    }
  }

  // TODO(b/148596273): Reconcile reference versions, perhaps with common
  // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
  //
  // int8 per-channel-quantized depthwise convolution. Same loop structure as
  // Run(), but the multiplier and shift are per output channel
  // (params.output_multiplier_per_channel / output_shift_per_channel) and no
  // filter offset is applied: per-channel int8 weights are symmetric
  // (zero point 0), so only the input offset is added back.
  static inline void RunPerChannel(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const int8_t* input_data, const RuntimeShape& filter_shape,
      const int8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      int8_t* output_data) {
    // Get parameters.
    // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t input_offset = params.input_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    const int32_t* output_multiplier = params.output_multiplier_per_channel;
    const int32_t* output_shift = params.output_shift_per_channel;

    // Check dimensions of the tensors.
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int batch = 0; batch < batches; ++batch) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
            for (int m = 0; m < depth_multiplier; ++m) {
              const int output_channel = m + in_channel * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // Zero padding by omitting the areas outside the image.
                  const bool is_point_inside_image =
                      (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height);
                  if (is_point_inside_image) {
                    int32_t input_val = input_data[Offset(
                        input_shape, batch, in_y, in_x, in_channel)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, output_channel)];
                    // Accumulate with 32 bits accumulator.
                    // In the nudging process during model quantization, we
                    // force real value of 0.0 be represented by a quantized
                    // value. This guarantees that the input_offset is a int8_t,
                    // even though it is represented using int32_t. int32_t +=
                    // int8_t
                    // * (int8_t - int8_t) so the highest value we can get from
                    // each accumulation is [-127, 127] * ([-128, 127] -
                    // [-128, 127]), which is [-32512, 32512]. log2(32512)
                    // = 14.98, which means we can accumulate at least 2^16
                    // multiplications without overflow. The accumulator is
                    // applied to a filter so the accumulation logic will hold
                    // as long as the filter size (filter_y * filter_x *
                    // in_channel) does not exceed 2^16, which is the case in
                    // all the models we have seen so far.
                    acc += filter_val * (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[output_channel];
              }
              // Per-channel rescale, re-apply the output zero point, clamp,
              // then narrow to int8.
              acc = DepthwiseConvRound<output_rounding>(
                  acc, output_multiplier[output_channel],
                  output_shift[output_channel]);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, batch, out_y, out_x,
                                 output_channel)] = static_cast<int8_t>(acc);
            }
          }
        }
      }
    }
  }
};
277
278 } // namespace depthwise_conv
279
DepthwiseConv(const DepthwiseParams & params,const RuntimeShape & input_shape,const uint8_t * input_data,const RuntimeShape & filter_shape,const uint8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,uint8_t * output_data)280 inline void DepthwiseConv(
281 const DepthwiseParams& params, const RuntimeShape& input_shape,
282 const uint8_t* input_data, const RuntimeShape& filter_shape,
283 const uint8_t* filter_data, const RuntimeShape& bias_shape,
284 const int32_t* bias_data, const RuntimeShape& output_shape,
285 uint8_t* output_data) {
286 return depthwise_conv::DepthwiseConvBasicKernel<
287 DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
288 input_data, filter_shape,
289 filter_data, bias_shape,
290 bias_data, output_shape,
291 output_data);
292 }
293
294 } // namespace reference_ops
295 } // end namespace tflite
296
297 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
298