1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
17
18 #include "tensorflow/lite/kernels/internal/common.h"
19
20 namespace tflite {
21 namespace reference_integer_ops {
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,int8_t * output_data)22 inline void DepthwiseConvPerChannel(
23 const DepthwiseParams& params, const int32_t* output_multiplier,
24 const int32_t* output_shift, const RuntimeShape& input_shape,
25 const int8_t* input_data, const RuntimeShape& filter_shape,
26 const int8_t* filter_data, const RuntimeShape& bias_shape,
27 const int32_t* bias_data, const RuntimeShape& output_shape,
28 int8_t* output_data) {
29 // Get parameters.
30 // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
31 const int stride_width = params.stride_width;
32 const int stride_height = params.stride_height;
33 const int dilation_width_factor = params.dilation_width_factor;
34 const int dilation_height_factor = params.dilation_height_factor;
35 const int pad_width = params.padding_values.width;
36 const int pad_height = params.padding_values.height;
37 const int depth_multiplier = params.depth_multiplier;
38 const int32_t input_offset = params.input_offset;
39 const int32_t output_offset = params.output_offset;
40 const int32_t output_activation_min = params.quantized_activation_min;
41 const int32_t output_activation_max = params.quantized_activation_max;
42
43 // Check dimensions of the tensors.
44 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
45 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
46 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
47
48 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
49 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
50 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
51 const int input_height = input_shape.Dims(1);
52 const int input_width = input_shape.Dims(2);
53 const int input_depth = input_shape.Dims(3);
54 const int filter_height = filter_shape.Dims(1);
55 const int filter_width = filter_shape.Dims(2);
56 const int output_height = output_shape.Dims(1);
57 const int output_width = output_shape.Dims(2);
58 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
59 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
60
61 for (int batch = 0; batch < batches; ++batch) {
62 for (int out_y = 0; out_y < output_height; ++out_y) {
63 for (int out_x = 0; out_x < output_width; ++out_x) {
64 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
65 for (int m = 0; m < depth_multiplier; ++m) {
66 const int output_channel = m + in_channel * depth_multiplier;
67 const int in_x_origin = (out_x * stride_width) - pad_width;
68 const int in_y_origin = (out_y * stride_height) - pad_height;
69 int32_t acc = 0;
70 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
71 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
72 const int in_x = in_x_origin + dilation_width_factor * filter_x;
73 const int in_y =
74 in_y_origin + dilation_height_factor * filter_y;
75 // Zero padding by omitting the areas outside the image.
76 const bool is_point_inside_image =
77 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
78 (in_y < input_height);
79 if (is_point_inside_image) {
80 int32_t input_val = input_data[Offset(
81 input_shape, batch, in_y, in_x, in_channel)];
82 int32_t filter_val = filter_data[Offset(
83 filter_shape, 0, filter_y, filter_x, output_channel)];
84 // Accumulate with 32 bits accumulator.
85 // In the nudging process during model quantization, we force
86 // real value of 0.0 be represented by a quantized value. This
87 // guarantees that the input_offset is a int8_t, even though
88 // it is represented using int32_t. int32_t += int8_t *
89 // (int8_t - int8_t) so the highest value we can get from each
90 // accumulation is [-127, 127] * ([-128, 127] -
91 // [-128, 127]), which is [-32512, 32512]. log2(32512)
92 // = 14.98, which means we can accumulate at least 2^16
93 // multiplications without overflow. The accumulator is
94 // applied to a filter so the accumulation logic will hold as
95 // long as the filter size (filter_y * filter_x * in_channel)
96 // does not exceed 2^16, which is the case in all the models
97 // we have seen so far.
98 // TODO(b/174275578): Add a check to make sure the
99 // accumulator depth is smaller than 2^16.
100 acc += filter_val * (input_val + input_offset);
101 }
102 }
103 }
104 if (bias_data) {
105 acc += bias_data[output_channel];
106 }
107 acc = MultiplyByQuantizedMultiplier(
108 acc, output_multiplier[output_channel],
109 output_shift[output_channel]);
110 acc += output_offset;
111 acc = std::max(acc, output_activation_min);
112 acc = std::min(acc, output_activation_max);
113 output_data[Offset(output_shape, batch, out_y, out_x,
114 output_channel)] = static_cast<int8_t>(acc);
115 }
116 }
117 }
118 }
119 }
120 }
121
DepthwiseConvPerChannel(const DepthwiseParams & params,const int32_t * output_multiplier,const int32_t * output_shift,const RuntimeShape & input_shape,const int16_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const std::int64_t * bias_data,const RuntimeShape & output_shape,int16_t * output_data)122 inline void DepthwiseConvPerChannel(
123 const DepthwiseParams& params, const int32_t* output_multiplier,
124 const int32_t* output_shift, const RuntimeShape& input_shape,
125 const int16_t* input_data, const RuntimeShape& filter_shape,
126 const int8_t* filter_data, const RuntimeShape& bias_shape,
127 const std::int64_t* bias_data, const RuntimeShape& output_shape,
128 int16_t* output_data) {
129 // Get parameters.
130 const int stride_width = params.stride_width;
131 const int stride_height = params.stride_height;
132 const int dilation_width_factor = params.dilation_width_factor;
133 const int dilation_height_factor = params.dilation_height_factor;
134 const int pad_width = params.padding_values.width;
135 const int pad_height = params.padding_values.height;
136 const int depth_multiplier = params.depth_multiplier;
137 const int32_t output_activation_min = params.quantized_activation_min;
138 const int32_t output_activation_max = params.quantized_activation_max;
139
140 // Check dimensions of the tensors.
141 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
142 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
143 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
144
145 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
146 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
147 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
148 const int input_height = input_shape.Dims(1);
149 const int input_width = input_shape.Dims(2);
150 const int input_depth = input_shape.Dims(3);
151 const int filter_height = filter_shape.Dims(1);
152 const int filter_width = filter_shape.Dims(2);
153 const int output_height = output_shape.Dims(1);
154 const int output_width = output_shape.Dims(2);
155 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
156 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
157
158 for (int batch = 0; batch < batches; ++batch) {
159 for (int out_y = 0; out_y < output_height; ++out_y) {
160 for (int out_x = 0; out_x < output_width; ++out_x) {
161 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
162 for (int m = 0; m < depth_multiplier; ++m) {
163 const int output_channel = m + in_channel * depth_multiplier;
164 const int in_x_origin = (out_x * stride_width) - pad_width;
165 const int in_y_origin = (out_y * stride_height) - pad_height;
166 std::int64_t acc = 0;
167 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
168 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
169 const int in_x = in_x_origin + dilation_width_factor * filter_x;
170 const int in_y =
171 in_y_origin + dilation_height_factor * filter_y;
172 // Zero padding by omitting the areas outside the image.
173 const bool is_point_inside_image =
174 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
175 (in_y < input_height);
176 if (is_point_inside_image) {
177 int32_t input_val = input_data[Offset(
178 input_shape, batch, in_y, in_x, in_channel)];
179 int32_t filter_val = filter_data[Offset(
180 filter_shape, 0, filter_y, filter_x, output_channel)];
181 // Accumulate with 64 bits accumulator.
182 // We assume maximum of 2^16 accumulations as with the 8-bit
183 // case so actually the value in the accumulator should not
184 // exceed 40 bits
185 acc += static_cast<int64_t>(filter_val) *
186 static_cast<int64_t>(input_val);
187 }
188 }
189 }
190 if (bias_data) {
191 acc += bias_data[output_channel];
192 }
193 int32_t scaled_acc = MultiplyByQuantizedMultiplier(
194 acc, output_multiplier[output_channel],
195 output_shift[output_channel]);
196 scaled_acc = std::max(scaled_acc, output_activation_min);
197 scaled_acc = std::min(scaled_acc, output_activation_max);
198 output_data[Offset(output_shape, batch, out_y, out_x,
199 output_channel)] =
200 static_cast<int16_t>(scaled_acc);
201 }
202 }
203 }
204 }
205 }
206 }
207
DepthwiseConvHybridPerChannel(const DepthwiseParams & params,float * scaling_factors_ptr,const RuntimeShape & input_shape,const int8_t * input_data,const RuntimeShape & filter_shape,const int8_t * filter_data,const RuntimeShape & bias_shape,const float * bias_data,const RuntimeShape & output_shape,float * output_data,const float * per_channel_scale,int32_t * input_offset)208 inline void DepthwiseConvHybridPerChannel(
209 const DepthwiseParams& params, float* scaling_factors_ptr,
210 const RuntimeShape& input_shape, const int8_t* input_data,
211 const RuntimeShape& filter_shape, const int8_t* filter_data,
212 const RuntimeShape& bias_shape, const float* bias_data,
213 const RuntimeShape& output_shape, float* output_data,
214 const float* per_channel_scale, int32_t* input_offset) {
215 const int stride_width = params.stride_width;
216 const int stride_height = params.stride_height;
217 const int dilation_width_factor = params.dilation_width_factor;
218 const int dilation_height_factor = params.dilation_height_factor;
219 const int pad_width = params.padding_values.width;
220 const int pad_height = params.padding_values.height;
221 const int depth_multiplier = params.depth_multiplier;
222 const float output_activation_min = params.float_activation_min;
223 const float output_activation_max = params.float_activation_max;
224 // Check dimensions of the tensors.
225 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
226 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
227 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
228
229 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
230 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
231 const int input_height = input_shape.Dims(1);
232 const int input_width = input_shape.Dims(2);
233 const int input_depth = input_shape.Dims(3);
234 const int filter_height = filter_shape.Dims(1);
235 const int filter_width = filter_shape.Dims(2);
236 const int output_height = output_shape.Dims(1);
237 const int output_width = output_shape.Dims(2);
238 const int bias_depth = bias_shape.FlatSize();
239 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
240 TFLITE_DCHECK_EQ(bias_depth, output_depth);
241
242 for (int batch = 0; batch < batches; ++batch) {
243 for (int out_y = 0; out_y < output_height; ++out_y) {
244 for (int out_x = 0; out_x < output_width; ++out_x) {
245 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
246 for (int m = 0; m < depth_multiplier; ++m) {
247 const int output_channel = m + in_channel * depth_multiplier;
248 const int in_x_origin = (out_x * stride_width) - pad_width;
249 const int in_y_origin = (out_y * stride_height) - pad_height;
250 int32_t acc = 0;
251 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
252 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
253 const int in_x = in_x_origin + dilation_width_factor * filter_x;
254 const int in_y =
255 in_y_origin + dilation_height_factor * filter_y;
256 // Zero padding by omitting the areas outside the image.
257 const bool is_point_inside_image =
258 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
259 (in_y < input_height);
260 if (is_point_inside_image) {
261 int32_t input_val = input_data[Offset(
262 input_shape, batch, in_y, in_x, in_channel)];
263 int32_t filter_val = filter_data[Offset(
264 filter_shape, 0, filter_y, filter_x, output_channel)];
265 acc += filter_val * (input_val - input_offset[batch]);
266 }
267 }
268 }
269 float acc_float = static_cast<float>(acc);
270 acc_float *=
271 per_channel_scale[output_channel] * scaling_factors_ptr[batch];
272 if (bias_data && output_channel < bias_depth) {
273 acc_float += bias_data[output_channel];
274 }
275 output_data[Offset(output_shape, batch, out_y, out_x,
276 output_channel)] =
277 ActivationFunctionWithMinMax(acc_float, output_activation_min,
278 output_activation_max);
279 }
280 }
281 }
282 }
283 }
284 }
285
286 } // namespace reference_integer_ops
287 } // namespace tflite
288
289 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
290