/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_

#include <string.h>

#include <algorithm>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                    const int8* input_data, const RuntimeShape& output_shape,
                    int8* output_data) {
  ruy::profiler::ScopeLabel label("MaxPool/8bit");

  // Here, and in other pooling ops, in order to maintain locality of
  // reference, to minimize some recalculations, and to load into NEON vector
  // registers, we use an inner loop down the depth. Since depths can be large
  // and hence we would need arbitrarily large temporary storage, we divide the
  // work up into depth tranches just within the batch loop.
  static constexpr int kPoolingAccTrancheSize = 256;

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

  int8 acc[kPoolingAccTrancheSize];
  for (int batch = 0; batch < batches; ++batch) {
    // We proceed through the depth in tranches (see comment above). The
    // depth_base is the depth at the beginning of the tranche. The
    // tranche_depth is the depth dimension of the tranche.
    for (int depth_base = 0; depth_base < depth;
         depth_base += kPoolingAccTrancheSize) {
      const int tranche_depth =
          std::min(depth - depth_base, kPoolingAccTrancheSize);
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
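          // Note: memset fills acc byte by byte, which is exactly what we
          // want here since acc holds int8 values; every lane starts at the
          // quantized activation minimum, the identity element for max.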
          memset(acc, params.quantized_activation_min,
                 tranche_depth * sizeof(acc[0]));
          const int8* input_ptr =
              input_data + depth_base +
              depth * (in_x_origin +
                       input_width * (in_y_origin + input_height * batch));
          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
            const int8* input_row_ptr =
                input_ptr + depth * (fy * input_width + filter_x_start);
            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
              const int8* input_channel_ptr = input_row_ptr;
              int channel = 0;
#ifdef USE_NEON
              for (; channel <= tranche_depth - 16; channel += 16) {
                int8x16_t acc_reg = vld1q_s8(acc + channel);
                int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                input_channel_ptr += 16;
                acc_reg = vmaxq_s8(acc_reg, input_reg);
                vst1q_s8(acc + channel, acc_reg);
              }

              for (; channel <= tranche_depth - 8; channel += 8) {
                int8x8_t acc_reg = vld1_s8(acc + channel);
                int8x8_t input_reg = vld1_s8(input_channel_ptr);
                input_channel_ptr += 8;
                acc_reg = vmax_s8(acc_reg, input_reg);
                vst1_s8(acc + channel, acc_reg);
              }
#endif
              for (; channel < tranche_depth; ++channel) {
                acc[channel] = std::max(acc[channel], *input_channel_ptr++);
              }
              input_row_ptr += depth;
            }
          }
          int8* output_ptr = output_data + Offset(output_shape, batch, out_y,
                                                  out_x, depth_base);
          int channel = 0;
#ifdef USE_NEON
          for (; channel <= tranche_depth - 16; channel += 16) {
            int8x16_t a = vld1q_s8(acc + channel);
            a = vminq_s8(a, vdupq_n_s8(params.quantized_activation_max));
            a = vmaxq_s8(a, vdupq_n_s8(params.quantized_activation_min));
            vst1q_s8(output_ptr + channel, a);
          }
          for (; channel <= tranche_depth - 8; channel += 8) {
            int8x8_t a = vld1_s8(acc + channel);
            a = vmin_s8(a, vdup_n_s8(params.quantized_activation_max));
            a = vmax_s8(a, vdup_n_s8(params.quantized_activation_min));
            vst1_s8(output_ptr + channel, a);
          }
#endif
          for (; channel < tranche_depth; ++channel) {
            int8 a = acc[channel];
            a = std::max<int8>(a, params.quantized_activation_min);
            a = std::min<int8>(a, params.quantized_activation_max);
            output_ptr[channel] = static_cast<int8>(a);
          }
        }
      }
    }
  }
}
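
// Illustrative call sketch (not part of this header's API): pooling a
// hypothetical 1x4x4x8 int8 tensor with a 2x2 window and stride 2. All shapes
// and parameter values below are made up for illustration only.
//
//   PoolParams params;
//   params.stride_height = 2;
//   params.stride_width = 2;
//   params.filter_height = 2;
//   params.filter_width = 2;
//   params.padding_values.height = 0;
//   params.padding_values.width = 0;
//   params.quantized_activation_min = -128;
//   params.quantized_activation_max = 127;
//   const RuntimeShape input_shape({1, 4, 4, 8});
//   const RuntimeShape output_shape({1, 2, 2, 8});
//   MaxPool(params, input_shape, input_data, output_shape, output_data);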

inline void AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const int8* input_data,
                        const RuntimeShape& output_shape, int8* output_data) {
  ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator");

  // Here, and in other pooling ops, in order to maintain locality of
  // reference, to minimize some recalculations, and to load into NEON vector
  // registers, we use an inner loop down the depth. Since depths can be large
  // and hence we would need arbitrarily large temporary storage, we divide the
  // work up into depth tranches just within the batch loop.
  static constexpr int kPoolingAccTrancheSize = 256;

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

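  // Accumulators are 32-bit: each holds a running sum of int8 values, so even
  // an extremely large pooling window (millions of taps at +/-127) stays well
  // within int32 range before the division below.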
  int32 acc[kPoolingAccTrancheSize];
  for (int batch = 0; batch < batches; ++batch) {
    // We proceed through the depth in tranches (see comment above). The
    // depth_base is the depth at the beginning of the tranche. The
    // tranche_depth is the depth dimension of the tranche.
    for (int depth_base = 0; depth_base < depth;
         depth_base += kPoolingAccTrancheSize) {
      const int tranche_depth =
          std::min(depth - depth_base, kPoolingAccTrancheSize);
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
          const int filter_count =
              (filter_x_end - filter_x_start) *
              (filter_y_end - filter_y_start);
          memset(acc, 0, tranche_depth * sizeof(acc[0]));
          const int8* input_ptr =
              input_data + depth_base +
              depth * (in_x_origin +
                       input_width * (in_y_origin + input_height * batch));
          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
            const int8* input_row_ptr =
                input_ptr + depth * (fy * input_width + filter_x_start);
            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
              const int8* input_channel_ptr = input_row_ptr;
              int channel = 0;
#ifdef USE_NEON
              for (; channel <= tranche_depth - 16; channel += 16) {
                int16x4_t acc_reg[4];
                int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                input_channel_ptr += 16;
                acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
                acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
                acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
                acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
                for (int i = 0; i < 4; i++) {
                  vst1q_s32(
                      acc + channel + 4 * i,
                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                }
              }
              for (; channel <= tranche_depth - 8; channel += 8) {
                int16x4_t acc_reg[2];
                int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
                input_channel_ptr += 8;
                acc_reg[0] = vget_low_s16(input_reg);
                acc_reg[1] = vget_high_s16(input_reg);
                for (int i = 0; i < 2; i++) {
                  vst1q_s32(
                      acc + channel + 4 * i,
                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                }
              }
#endif
              for (; channel < tranche_depth; ++channel) {
                acc[channel] += *input_channel_ptr++;
              }
              input_row_ptr += depth;
            }
          }
          int8* output_ptr = output_data + Offset(output_shape, batch, out_y,
                                                  out_x, depth_base);
          int channel = 0;
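          // Sums are turned into averages with rounding division: adding
          // (or, for negative sums, subtracting) filter_count / 2 before the
          // truncating division rounds to nearest, half away from zero. For
          // example, with filter_count = 4, acc = 7 gives (7 + 2) / 4 = 2 and
          // acc = -7 gives (-7 - 2) / 4 = -2.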
#ifdef USE_NEON
          for (; channel <= tranche_depth - 8; channel += 8) {
            int16 buf[8];
            for (int i = 0; i < 8; i++) {
              buf[i] =
                  acc[channel + i] > 0
                      ? (acc[channel + i] + filter_count / 2) / filter_count
                      : (acc[channel + i] - filter_count / 2) / filter_count;
            }
            int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
            buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
            buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
            vst1_s8(output_ptr + channel, buf8);
          }
#endif
          for (; channel < tranche_depth; ++channel) {
            int16 a = acc[channel] > 0
                          ? (acc[channel] + filter_count / 2) / filter_count
                          : (acc[channel] - filter_count / 2) / filter_count;
            a = std::max<int16>(a, params.quantized_activation_min);
            a = std::min<int16>(a, params.quantized_activation_max);
            output_ptr[channel] = static_cast<int8>(a);
          }
        }
      }
    }
  }
}
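
// AveragePool takes the same arguments as MaxPool above, so the illustrative
// call sketch following MaxPool applies verbatim with AveragePool substituted
// for MaxPool.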

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_