/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_

#include <algorithm>

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {
namespace reference_integer_ops {
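// Per-channel quantized int8 depthwise convolution: int8 input/filter/output,
// int32 bias, with per-output-channel output_multiplier/output_shift used to
// requantize the int32 accumulator back to int8.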
inline void DepthwiseConvPerChannel(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data) {
  // Get parameters.
  // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32_t input_offset = params.input_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;

  // Check dimensions of the tensors.
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          for (int m = 0; m < depth_multiplier; ++m) {
            const int output_channel = m + in_channel * depth_multiplier;
            const int in_x_origin = (out_x * stride_width) - pad_width;
            const int in_y_origin = (out_y * stride_height) - pad_height;
            int32_t acc = 0;
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                const int in_y =
                    in_y_origin + dilation_height_factor * filter_y;
                // Zero padding by omitting the areas outside the image.
                const bool is_point_inside_image =
                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height);
                if (is_point_inside_image) {
                  int32_t input_val = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  int32_t filter_val = filter_data[Offset(
                      filter_shape, 0, filter_y, filter_x, output_channel)];
                  // Accumulate with a 32-bit accumulator.
                  // In the nudging process during model quantization, we force
                  // the real value 0.0 to be represented by a quantized value.
                  // This guarantees that input_offset fits in an int8_t, even
                  // though it is stored as an int32_t. Each step computes
                  // int32_t += int8_t * (int8_t - int8_t), so the largest
                  // value a single product can reach is [-127, 127] *
                  // ([-128, 127] - [-128, 127]), which is [-32512, 32512].
                  // log2(32512) = 14.98, which means we can accumulate at
                  // least 2^16 multiplications without overflow. The
                  // accumulator is applied to a single filter, so this holds
                  // as long as the filter size (filter_y * filter_x *
                  // in_channel) does not exceed 2^16, which is the case in all
                  // the models we have seen so far.
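                  // As a rough check (assuming the ranges above): the int32_t
                  // headroom is (2^31 - 1) / 32512, roughly 66000 products,
                  // which is just above the 2^16 bound quoted above.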
                  // TODO(b/174275578): Add a check to make sure the
                  // accumulator depth is smaller than 2^16.
                  acc += filter_val * (input_val + input_offset);
                }
              }
            }
            if (bias_data) {
              acc += bias_data[output_channel];
            }
            acc = MultiplyByQuantizedMultiplier(
                acc, output_multiplier[output_channel],
                output_shift[output_channel]);
            acc += output_offset;
            acc = std::max(acc, output_activation_min);
            acc = std::min(acc, output_activation_max);
            output_data[Offset(output_shape, batch, out_y, out_x,
                               output_channel)] = static_cast<int8_t>(acc);
          }
        }
      }
    }
  }
}
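
// A minimal usage sketch for the int8 kernel above (illustrative shapes and
// quantization values only; in practice they come from the converted model):
//
//   DepthwiseParams params;
//   params.stride_width = 1;
//   params.stride_height = 1;
//   params.dilation_width_factor = 1;
//   params.dilation_height_factor = 1;
//   params.padding_values.width = 0;
//   params.padding_values.height = 0;
//   params.depth_multiplier = 1;
//   params.input_offset = 0;   // -input_zero_point
//   params.output_offset = 0;  // output_zero_point
//   params.quantized_activation_min = -128;
//   params.quantized_activation_max = 127;
//
//   const RuntimeShape input_shape({1, 3, 3, 2});
//   const RuntimeShape filter_shape({1, 2, 2, 2});
//   const RuntimeShape bias_shape({2});
//   const RuntimeShape output_shape({1, 2, 2, 2});
//   // Per-output-channel requantization parameters, derived from the scales.
//   int32_t output_multiplier[2];
//   int32_t output_shift[2];
//   // input_data, filter_data, output_data: int8_t buffers of the matching
//   // FlatSize(); bias_data: int32_t[2].
//
//   DepthwiseConvPerChannel(params, output_multiplier, output_shift,
//                           input_shape, input_data, filter_shape,
//                           filter_data, bias_shape, bias_data,
//                           output_shape, output_data);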
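
// 16x8 variant: int16 input and output, int8 filter, and a 64-bit bias and
// accumulator. No input/output offsets are applied here, which matches a
// symmetric (zero-point 0) 16-bit activation scheme.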
inline void DepthwiseConvPerChannel(
    const DepthwiseParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int16_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const std::int64_t* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data) {
  // Get parameters.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;

  // Check dimensions of the tensors.
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          for (int m = 0; m < depth_multiplier; ++m) {
            const int output_channel = m + in_channel * depth_multiplier;
            const int in_x_origin = (out_x * stride_width) - pad_width;
            const int in_y_origin = (out_y * stride_height) - pad_height;
            std::int64_t acc = 0;
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                const int in_y =
                    in_y_origin + dilation_height_factor * filter_y;
                // Zero padding by omitting the areas outside the image.
                const bool is_point_inside_image =
                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height);
                if (is_point_inside_image) {
                  int32_t input_val = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  int32_t filter_val = filter_data[Offset(
                      filter_shape, 0, filter_y, filter_x, output_channel)];
                  // Accumulate with a 64-bit accumulator.
                  // We assume a maximum of 2^16 accumulations, as in the
                  // 8-bit case, so the value in the accumulator should not
                  // exceed 40 bits.
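                  // As a rough check (illustrative bound, assuming int8
                  // filters restricted to [-127, 127]): |product| <=
                  // 127 * 32768 < 2^22, and 2^22 * 2^16 = 2^38 < 2^40.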
                  acc += static_cast<int64_t>(filter_val) *
                         static_cast<int64_t>(input_val);
                }
              }
            }
            if (bias_data) {
              acc += bias_data[output_channel];
            }
            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
                acc, output_multiplier[output_channel],
                output_shift[output_channel]);
            scaled_acc = std::max(scaled_acc, output_activation_min);
            scaled_acc = std::min(scaled_acc, output_activation_max);
            output_data[Offset(output_shape, batch, out_y, out_x,
                               output_channel)] =
                static_cast<int16_t>(scaled_acc);
          }
        }
      }
    }
  }
}
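
// Hybrid (dynamic-range) variant: int8 filter with float bias and output.
// The int8 input carries per-batch quantization parameters (input_offset and
// scaling_factors_ptr), and the int32 accumulator is rescaled to float with
// per_channel_scale[output_channel] * scaling_factors_ptr[batch].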
inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, float* scaling_factors_ptr,
    const RuntimeShape& input_shape, const int8_t* input_data,
    const RuntimeShape& filter_shape, const int8_t* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scale, int32_t* input_offset) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  // Check dimensions of the tensors.
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int bias_depth = bias_shape.FlatSize();
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_depth, output_depth);

  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          for (int m = 0; m < depth_multiplier; ++m) {
            const int output_channel = m + in_channel * depth_multiplier;
            const int in_x_origin = (out_x * stride_width) - pad_width;
            const int in_y_origin = (out_y * stride_height) - pad_height;
            int32_t acc = 0;
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                const int in_y =
                    in_y_origin + dilation_height_factor * filter_y;
                // Zero padding by omitting the areas outside the image.
                const bool is_point_inside_image =
                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height);
                if (is_point_inside_image) {
                  int32_t input_val = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  int32_t filter_val = filter_data[Offset(
                      filter_shape, 0, filter_y, filter_x, output_channel)];
                  acc += filter_val * (input_val - input_offset[batch]);
                }
              }
            }
            float acc_float = static_cast<float>(acc);
            acc_float *=
                per_channel_scale[output_channel] * scaling_factors_ptr[batch];
            if (bias_data && output_channel < bias_depth) {
              acc_float += bias_data[output_channel];
            }
            output_data[Offset(output_shape, batch, out_y, out_x,
                               output_channel)] =
                ActivationFunctionWithMinMax(acc_float, output_activation_min,
                                             output_activation_max);
          }
        }
      }
    }
  }
}

}  // namespace reference_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_