/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_

#include <string.h>

#include <algorithm>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                    const int8* input_data, const RuntimeShape& output_shape,
                    int8* output_data) {
  ruy::profiler::ScopeLabel label("MaxPool/8bit");

  // Here, and in other pooling ops, in order to maintain locality of
  // reference, to minimize some recalculations, and to load into NEON vector
  // registers, we use an inner loop down the depth. Since depths can be large,
  // and we would hence need arbitrarily large temporary storage, we divide the
  // work up into depth tranches just within the batch loop.
  static constexpr int kPoolingAccTrancheSize = 256;

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

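  // Per-tranche accumulator: one running maximum per channel, for at most
  // kPoolingAccTrancheSize channels per pass.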
  int8 acc[kPoolingAccTrancheSize];
  for (int batch = 0; batch < batches; ++batch) {
    // We proceed through the depth in tranches (see comment above). The
    // depth_base is the depth at the beginning of the tranche. The
    // tranche_depth is the depth dimension of the tranche.
    for (int depth_base = 0; depth_base < depth;
         depth_base += kPoolingAccTrancheSize) {
      const int tranche_depth =
          std::min(depth - depth_base, kPoolingAccTrancheSize);
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
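          // Clamp the filter window to the valid input region, so taps that
          // fall into the padding are skipped rather than read.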
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
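          // Seed the running maxima with the activation floor. memset writes
          // per byte, which is exact here because acc holds int8 values.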
          memset(acc, params.quantized_activation_min,
                 tranche_depth * sizeof(acc[0]));
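          // input_ptr addresses (batch, in_y_origin, in_x_origin, depth_base)
          // in the NHWC input; the filter loops below offset from it.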
          const int8* input_ptr =
              input_data + depth_base +
              depth * (in_x_origin +
                       input_width * (in_y_origin + input_height * batch));
          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
            const int8* input_row_ptr =
                input_ptr + depth * (fy * input_width + filter_x_start);
            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
              const int8* input_channel_ptr = input_row_ptr;
              int channel = 0;
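              // NEON path: take elementwise maxima 16 channels at a time,
              // then 8; the scalar loop below handles the remainder.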
#ifdef USE_NEON
              for (; channel <= tranche_depth - 16; channel += 16) {
                int8x16_t acc_reg = vld1q_s8(acc + channel);
                int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                input_channel_ptr += 16;
                acc_reg = vmaxq_s8(acc_reg, input_reg);
                vst1q_s8(acc + channel, acc_reg);
              }

              for (; channel <= tranche_depth - 8; channel += 8) {
                int8x8_t acc_reg = vld1_s8(acc + channel);
                int8x8_t input_reg = vld1_s8(input_channel_ptr);
                input_channel_ptr += 8;
                acc_reg = vmax_s8(acc_reg, input_reg);
                vst1_s8(acc + channel, acc_reg);
              }
#endif
              for (; channel < tranche_depth; ++channel) {
                acc[channel] = std::max(acc[channel], *input_channel_ptr++);
              }
              input_row_ptr += depth;
            }
          }
          int8* output_ptr = output_data + Offset(output_shape, batch, out_y,
                                                  out_x, depth_base);
          int channel = 0;
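          // Clamp the maxima to [quantized_activation_min,
          // quantized_activation_max] and write them out, vectorized where
          // NEON is available.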
#ifdef USE_NEON
          for (; channel <= tranche_depth - 16; channel += 16) {
            int8x16_t a = vld1q_s8(acc + channel);
            a = vminq_s8(a, vdupq_n_s8(params.quantized_activation_max));
            a = vmaxq_s8(a, vdupq_n_s8(params.quantized_activation_min));
            vst1q_s8(output_ptr + channel, a);
          }
          for (; channel <= tranche_depth - 8; channel += 8) {
            int8x8_t a = vld1_s8(acc + channel);
            a = vmin_s8(a, vdup_n_s8(params.quantized_activation_max));
            a = vmax_s8(a, vdup_n_s8(params.quantized_activation_min));
            vst1_s8(output_ptr + channel, a);
          }
#endif
          for (; channel < tranche_depth; ++channel) {
            int8 a = acc[channel];
            a = std::max<int8>(a, params.quantized_activation_min);
            a = std::min<int8>(a, params.quantized_activation_max);
            output_ptr[channel] = static_cast<int8>(a);
          }
        }
      }
    }
  }
}

inline void AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape, const int8* input_data,
                        const RuntimeShape& output_shape, int8* output_data) {
  ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator");

  // Here, and in other pooling ops, in order to maintain locality of
  // reference, to minimize some recalculations, and to load into NEON vector
  // registers, we use an inner loop down the depth. Since depths can be large,
  // and we would hence need arbitrarily large temporary storage, we divide the
  // work up into depth tranches just within the batch loop.
  static constexpr int kPoolingAccTrancheSize = 256;

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

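  // Sums are accumulated in 32 bits so that even a very large filter window
  // cannot overflow the running total of 8-bit inputs.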
  int32 acc[kPoolingAccTrancheSize];
  for (int batch = 0; batch < batches; ++batch) {
    // We proceed through the depth in tranches (see comment above). The
    // depth_base is the depth at the beginning of the tranche. The
    // tranche_depth is the depth dimension of the tranche.
    for (int depth_base = 0; depth_base < depth;
         depth_base += kPoolingAccTrancheSize) {
      const int tranche_depth =
          std::min(depth - depth_base, kPoolingAccTrancheSize);
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
          const int filter_count =
              (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
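          // filter_count is the number of taps that land inside the input;
          // dividing by it means edge outputs average only real samples, not
          // padding.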
          memset(acc, 0, tranche_depth * sizeof(acc[0]));
          const int8* input_ptr =
              input_data + depth_base +
              depth * (in_x_origin +
                       input_width * (in_y_origin + input_height * batch));
          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
            const int8* input_row_ptr =
                input_ptr + depth * (fy * input_width + filter_x_start);
            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
              const int8* input_channel_ptr = input_row_ptr;
              int channel = 0;
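              // NEON path: widen int8 inputs to int16 lanes with vmovl_s8,
              // then fold them into the int32 sums via the widening add
              // vaddw_s16, 16 (then 8) channels per iteration.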
#ifdef USE_NEON
              for (; channel <= tranche_depth - 16; channel += 16) {
                int16x4_t acc_reg[4];
                int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                input_channel_ptr += 16;
                acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
                acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
                acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
                acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
                for (int i = 0; i < 4; i++) {
                  vst1q_s32(
                      acc + channel + 4 * i,
                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                }
              }
              for (; channel <= tranche_depth - 8; channel += 8) {
                int16x4_t acc_reg[2];
                int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
                input_channel_ptr += 8;
                acc_reg[0] = vget_low_s16(input_reg);
                acc_reg[1] = vget_high_s16(input_reg);
                for (int i = 0; i < 2; i++) {
                  vst1q_s32(
                      acc + channel + 4 * i,
                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                }
              }
#endif
              for (; channel < tranche_depth; ++channel) {
                acc[channel] += *input_channel_ptr++;
              }
              input_row_ptr += depth;
            }
          }
          int8* output_ptr = output_data + Offset(output_shape, batch, out_y,
                                                  out_x, depth_base);
          int channel = 0;
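          // Divide with rounding half away from zero: biasing the sum by
          // +/- filter_count / 2 before the truncating division rounds to the
          // nearest integer. vqmovn_s16 then narrows to int8 with saturation.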
#ifdef USE_NEON
          for (; channel <= tranche_depth - 8; channel += 8) {
            int16 buf[8];
            for (int i = 0; i < 8; i++) {
              buf[i] =
                  acc[channel + i] > 0
                      ? (acc[channel + i] + filter_count / 2) / filter_count
                      : (acc[channel + i] - filter_count / 2) / filter_count;
            }
            int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
            buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
            buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
            vst1_s8(output_ptr + channel, buf8);
          }
#endif
          for (; channel < tranche_depth; ++channel) {
            int16 a = acc[channel] > 0
                          ? (acc[channel] + filter_count / 2) / filter_count
                          : (acc[channel] - filter_count / 2) / filter_count;
            a = std::max<int16>(a, params.quantized_activation_min);
            a = std::min<int16>(a, params.quantized_activation_max);
            output_ptr[channel] = static_cast<int8>(a);
          }
        }
      }
    }
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_