1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
17
18 // This file provides kernel implementations that are not used in shipped
19 // inference code, but rather (a) show how model C++ code is designed and then
20 // transformed into asm code, and (b) aid with maintenance and later development
21 // of variations. Many projects (even including, say, the classic NAG libraries)
22 // develop highly optimized code, but do not maintain intermediate versions.
23 // Often the result is incomprehensible final-version code.
24
25 #include <algorithm>
26
27 #include "tensorflow/lite/kernels/internal/compatibility.h"
28 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
29 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
30 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
31 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
32 #include "tensorflow/lite/kernels/internal/types.h"
33
34 namespace tflite {
35 namespace optimized_ops {
36 namespace depthwise_conv {
37
38 #ifdef USE_NEON
39
40 inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) {
41 return vst1_u8(data_addr, vreinterpret_u8_s8(reg));
42 }
43 inline void util_vst1_x8(int8* data_addr, int8x8_t reg) {
44 return vst1_s8(data_addr, reg);
45 }
46
47 // Lane operations are for clarity and convenience. We want to load and store
48 // 4 8-bit lanes together. So these are treated much like 32-bit loads and
49 // 32-bit stores. Stores require 32-bit alignment.
50
51 #define vst1_lane_8x4(dst, reg, lane_num) \
52 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
53 vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
54 #define vst1q_lane_8x4(dst, reg, lane_num) \
55 TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
56 vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
57
58 // Important! Most compilation configurations will compile and run without
59 // reinterpret_cast. Sanitizers may fail silently on lane-loading, with an
60 // obscure bug or mis-feature probably in unhygienic macro expansion.
61 #define vld1q_lane_s8x8(src, reg, lane_num) \
62 vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
63 #define vld1_lane_8x4(src, reg, lane_num) \
64 vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
65 #define vld1q_lane_8x4(src, reg, lane_num) \
66 vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
67 #define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
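
// Minimal usage sketch of the macros above (illustrative only; this helper is
// not referenced by the kernels below and its name is arbitrary). It copies
// 4 bytes through a single 32-bit lane; the destination is assumed to be
// 4-byte aligned, as the store macro requires.
inline void util_example_lane_copy_4(const int8* src, int8* dst) {
  int32x2_t reg = vdup_n_s32(0);
  reg = vld1_lane_8x4(src, reg, 0);
  vst1_lane_8x4(dst, vreinterpret_u32_s32(reg), 0);
}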
68
69 #endif // USE_NEON
70
71 template <QuantizationType quantization_type>
72 struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
73 quantization_type> {
74 // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
75 // width 3, depth micro blocks, sub-block 0 or 1, depth 4. Filter data is written as
76 // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
77 //
78 // Note that this rearrangement is much like that performed on input data when
79 // filling the workspace, and optimized versions will be similar.
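// Worked index example (illustrative, assuming depth = 8 so there is a single
// depth micro block): the filter tap at height y, width x, sub-block s,
// depth-in-sub-block z is read from
//   filter_block[(3 * y + x) * depth + 4 * s + z]
// and written, after subtraction of the symmetric zero point, to
// filter_bank[y][s][z][x]. For y = 1, x = 2, s = 0, z = 3 the source byte is
// filter_block[(3 * 1 + 2) * 8 + 3] = filter_block[43] and the destination is
// filter_bank[1][0][3][2]; the padding column x = 3 of loaded_filter is never
// read from filter_block at all.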
80 static inline void FillFilterBank(int depth, const uint8* filter_block,
81 int8 filter_bank[3][2][4][4]) {
82 constexpr int kSymmetricZeroPoint =
83 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
84 // Load filter data in, 8-bytes down depth / sub-block at a time.
85 //
86 // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
87 // depth 4.
88 uint8 loaded_filter[3][4][2][4];
89 for (int y = 0; y < 3; ++y) {
90 for (int x = 0; x < 3; ++x) {
91 memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
92 8);
93 }
94 // Pad the filter with symmetric representation of 0, so that the values
95 // become 0 when the zero point is subtracted below. Thus these filter taps are
96 // effectively disregarded in later filtering.
97 memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
98 }
99 for (int y = 0; y < 3; ++y) {
100 for (int z = 0; z < 4; ++z) {
101 for (int x = 0; x < 4; ++x) {
102 filter_bank[y][0][z][x] =
103 loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
104 filter_bank[y][1][z][x] =
105 loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
106 }
107 }
108 }
109 }
110
111 // Adjust the bias (weights) data according to the input offset.
112 //
113 // The output calculation is
114 // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
115 // (filter[i][j][d] + filter_offset)
116 // (where offsets are expressed as differences from 128).
117 //
118 // Since we cannot efficiently handle varying offsets / bias across the image,
119 // we insist on filter_offset = 0.
120 //
121 // This function calculates
122 // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
123 // which accounts for the input offset. Note that even if the bias is constant
124 // over the depth, the adjusted bias will in general vary, because the filter sums differ per depth.
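// Worked numeric example (illustrative, using the TFLite convention that
// input_offset is the negated input zero point): with a uint8 zero point of
// 128, input_offset = -128 and input_offset_difference = -128 + 128 = 0, so
// adjusted_bias[d] == bias[d]. With a zero point of 100, input_offset = -100,
// input_offset_difference = 28, and each adjusted_bias[d] gains
// 28 * (sum of the nine filter taps at depth d).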
125 static inline void AdjustBias(int32 input_offset,
126 const int8 filter_bank[3][2][4][4],
127 const int32* bias_data,
128 int32 adjusted_bias_block[2][4]) {
129 constexpr int kSymmetricZeroPoint =
130 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
131 TFLITE_DCHECK_GE(input_offset, -255);
132 TFLITE_DCHECK_LE(input_offset, 0);
133 // For instance, if input_offset == -128, no adjustment is needed.
134 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
135
136 for (int s = 0; s < 2; ++s) {
137 for (int z = 0; z < 4; ++z) {
138 adjusted_bias_block[s][z] = bias_data[4 * s + z];
139 for (int i = 0; i < 9; ++i) {
140 adjusted_bias_block[s][z] +=
141 input_offset_difference * filter_bank[i % 3][s][z][i / 3];
142 }
143 }
144 }
145 }
146
147 static void Run(const uint8* filter_data, const int32* bias_data,
148 int8* shuffled_filter_data, int32* adjusted_bias_data,
149 const DepthwiseConvDotProdParams* function_params) {
150 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
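// 2 * 3 * 4 * 4 = 96 bytes per depth micro block: 3 filter heights x
// 2 sub-blocks x (4 depth values x 4 width values), i.e. one filter_bank's
// worth of shuffled data.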
151 const int depth = function_params->output_depth;
152 const int depth_micro_repeats = function_params->depth_micro_repeats;
153 const int bias_increment = function_params->bias_increment;
154 const int32 input_offset = function_params->input_offset;
155
156 int8 filter_bank[3][2][4][4];
157 int32 adjusted_bias_block[2][4];
158
159 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
160 FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
161 AdjustBias(input_offset, filter_bank,
162 bias_data + 2 * bias_increment * j_depth, adjusted_bias_block);
163
164 memcpy(shuffled_filter_data, filter_bank[0][0][0],
165 shuffled_filter_increment);
166 shuffled_filter_data += shuffled_filter_increment;
167 memcpy(adjusted_bias_data, adjusted_bias_block[0],
168 8 * sizeof(adjusted_bias_block[0][0]));
169 adjusted_bias_data += 8;
170 }
171 }
172 };
173
174 template <QuantizationType quantization_type>
175 struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
176 quantization_type> {
177 static inline void Run(const uint8* filter_data, const int32* bias_data,
178 int8* shuffled_filter_data, int32* adjusted_bias_data,
179 const DepthwiseConvDotProdParams* function_params) {
180 const int depth = function_params->output_depth;
181 const int depth_micro_repeats = function_params->depth_micro_repeats;
182 const int bias_increment = function_params->bias_increment;
183
184 // Simulate NEON-register transposition of subset of filter.
185 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
186 int8 filter_bank_a_1[4][4];
187 int8 filter_bank_a_2[4][4];
188 int8 filter_bank_b_0[4][4];
189 int8 filter_bank_b_1[4][4];
190 int8 filter_bank_b_2[4][4];
191
192 // Load filter data in, essentially dropping the [depth/8] dimension, which
193 // is equivalent to loading just the depth needed for one micro-block.
194 //
195 // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
196 // depth 4.
197 uint8 loaded_filter_0[4][2][4];
198 uint8 loaded_filter_1[4][2][4];
199 uint8 loaded_filter_2[4][2][4];
200
201 constexpr int kSymmetricZeroPoint =
202 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
203 const int32 input_offset = function_params->input_offset;
204 TFLITE_DCHECK_GE(input_offset, -255);
205 TFLITE_DCHECK_LE(input_offset, 0);
206 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
207
208 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
209 const uint8* filter_block = filter_data + 8 * j_depth;
210
211 // Filter data is provided as filter_block[3][3][depth/8][2][4].
212 // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
213 // filter_bank[3][2][4][4]; height 3, sub-block 0 or 1, depth 4, width 4.
214 for (int x = 0; x < 3; ++x) {
215 memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
216 8);
217 memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
218 8);
219 memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
220 8);
221 }
222 // Pad the filter with the symmetric representation of 0, so that the values
223 // become 0 when the symmetric zero point is subtracted below, and so these
224 // filter taps are effectively disregarded.
225 memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
226 memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
227 memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);
228
229 for (int z = 0; z < 4; ++z) {
230 for (int x = 0; x < 4; ++x) {
231 filter_bank_a_0[z][x] =
232 loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
233 filter_bank_b_0[z][x] =
234 loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
235 filter_bank_a_1[z][x] =
236 loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
237 filter_bank_b_1[z][x] =
238 loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
239 filter_bank_a_2[z][x] =
240 loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
241 filter_bank_b_2[z][x] =
242 loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
243 }
244 }
245
246 memcpy(shuffled_filter_data, filter_bank_a_0, 16);
247 shuffled_filter_data += 16;
248 memcpy(shuffled_filter_data, filter_bank_b_0, 16);
249 shuffled_filter_data += 16;
250 memcpy(shuffled_filter_data, filter_bank_a_1, 16);
251 shuffled_filter_data += 16;
252 memcpy(shuffled_filter_data, filter_bank_b_1, 16);
253 shuffled_filter_data += 16;
254 memcpy(shuffled_filter_data, filter_bank_a_2, 16);
255 shuffled_filter_data += 16;
256 memcpy(shuffled_filter_data, filter_bank_b_2, 16);
257 shuffled_filter_data += 16;
258
259 int32 adjusted_bias_data_0[4];
260 int32 adjusted_bias_data_1[4];
261 // For instance, if input_offset == -128, no adjustment is needed.
262 for (int z = 0; z < 4; ++z) {
263 adjusted_bias_data_0[z] = bias_data[z];
264 adjusted_bias_data_1[z] = bias_data[4 + z];
265 for (int x = 0; x < 4; ++x) {
266 adjusted_bias_data_0[z] +=
267 input_offset_difference * filter_bank_a_0[z][x];
268 adjusted_bias_data_0[z] +=
269 input_offset_difference * filter_bank_a_1[z][x];
270 adjusted_bias_data_0[z] +=
271 input_offset_difference * filter_bank_a_2[z][x];
272 adjusted_bias_data_1[z] +=
273 input_offset_difference * filter_bank_b_0[z][x];
274 adjusted_bias_data_1[z] +=
275 input_offset_difference * filter_bank_b_1[z][x];
276 adjusted_bias_data_1[z] +=
277 input_offset_difference * filter_bank_b_2[z][x];
278
279 adjusted_bias_data[z] = adjusted_bias_data_0[z];
280 adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
281 }
282 }
283 bias_data += 2 * bias_increment;
284 adjusted_bias_data += 8;
285 }
286 }
287 };
288
289 #ifdef USE_NEON
290 template <QuantizationType quantization_type>
291 struct ProcessPerDepth<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
292 quantization_type> {
293 static void ProcessPerDepthIntrinsics(
294 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
295 filter_data,
296 const int32* bias_data, int8* shuffled_filter_data,
297 int32* adjusted_bias_data,
298 const DepthwiseConvDotProdParams* function_params) {
299 const int depth = function_params->output_depth;
300 const int depth_micro_repeats = function_params->depth_micro_repeats;
301 const int bias_increment = function_params->bias_increment;
302
303 constexpr int kSymmetricZeroPoint =
304 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
305 constexpr uint8 kSignBit =
306 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
307 const int32 input_offset = function_params->input_offset;
308 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
309 TFLITE_DCHECK_GE(input_offset, -255);
310 TFLITE_DCHECK_LE(input_offset, 0);
311 }
312 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
313 const int8x16_t ones_vector = vdupq_n_s8(1);
314
315 // Simulate NEON-register transposition of subset of filter.
316 int8x16_t input_0_a;
317 int8x16_t input_0_b;
318 int8x16_t input_0_c;
319 int8x16_t input_1_a;
320 int8x16_t input_1_b;
321 int8x16_t input_1_c;
322 int8x16_t input_2_a;
323 int8x16_t input_2_b;
324 int8x16_t input_2_c;
325
326 int8x16_t filter_0_a;
327 int8x16_t filter_0_b;
328 int8x16_t filter_1_a;
329 int8x16_t filter_1_b;
330 int8x16_t filter_2_a;
331 int8x16_t filter_2_b;
332
333 // For uint8, effect subtraction of zero-point = 128 by XOR of sign bit.
334 const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
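// Worked example (illustrative): XOR with 0x80 flips the top bit, so a uint8
// value v, reinterpreted as int8 after the XOR, equals v - 128:
//   0 (0x00) ^ 0x80 = 0x80 = -128;  128 (0x80) ^ 0x80 = 0x00 = 0;
//   255 (0xFF) ^ 0x80 = 0x7F = 127.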
335
336 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
337 filter_block = filter_data;
338 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
339 // Filter data is provided as filter_block[3][3][depth/8][2][4].
340 // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
341 // filter_bank[3][2][4][4]; height 3, sub-block 0 or 1, depth 4, width 4.
342
343 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
344 filter_block_ptr = filter_block;
345 input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
346 filter_block_ptr += depth;
347 input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
348 filter_block_ptr += depth;
349 input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
350 filter_block_ptr += depth;
351 input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
352 filter_block_ptr += depth;
353 input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
354 filter_block_ptr += depth;
355 input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
356 filter_block_ptr += depth;
357 input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
358 filter_block_ptr += depth;
359 input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
360 filter_block_ptr += depth;
361 input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);
362
363 filter_0_a = vzip1q_s8(input_0_a, input_0_b);
364 filter_0_b = vzip1q_s8(input_0_c, sign_bit);
365 filter_1_a = vzip1q_s8(input_1_a, input_1_b);
366 filter_1_b = vzip1q_s8(input_1_c, sign_bit);
367 filter_2_a = vzip1q_s8(input_2_a, input_2_b);
368 filter_2_b = vzip1q_s8(input_2_c, sign_bit);
369 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
370 filter_0_a = veorq_s8(filter_0_a, sign_bit);
371 filter_0_b = veorq_s8(filter_0_b, sign_bit);
372 filter_1_a = veorq_s8(filter_1_a, sign_bit);
373 filter_1_b = veorq_s8(filter_1_b, sign_bit);
374 filter_2_a = veorq_s8(filter_2_a, sign_bit);
375 filter_2_b = veorq_s8(filter_2_b, sign_bit);
376 }
377 vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
378 vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
379 vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);
380
381 vst1q_s8(shuffled_filter_data, filter_0_a);
382 shuffled_filter_data += 16;
383 vst1q_s8(shuffled_filter_data, filter_0_b);
384 shuffled_filter_data += 16;
385 vst1q_s8(shuffled_filter_data, filter_1_a);
386 shuffled_filter_data += 16;
387 vst1q_s8(shuffled_filter_data, filter_1_b);
388 shuffled_filter_data += 16;
389 vst1q_s8(shuffled_filter_data, filter_2_a);
390 shuffled_filter_data += 16;
391 vst1q_s8(shuffled_filter_data, filter_2_b);
392 shuffled_filter_data += 16;
393
394 int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
395 bias_data += bias_increment;
396 int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
397 bias_data += bias_increment;
398 // For instance, if input_offset is -kIntSymmetricZeroPoint, no adjustment
399 // is needed.
400
401 int32x4_t filter_sum_a = vdupq_n_s32(0);
402 filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
403 filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
404 filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
405 int32x4_t filter_sum_b = vdupq_n_s32(0);
406 filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
407 filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
408 filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);
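// Each vdotq_s32 above accumulates, into every 32-bit lane, the dot product
// of four adjacent int8 filter values with the all-ones vector, i.e. their
// plain sum; across the three calls per sub-block this yields
// sum_ij filter[i][j][d] for each of the four depth positions, matching the
// C-model AdjustBias computation.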
409
410 adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
411 input_offset_difference);
412 adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
413 input_offset_difference);
414
415 vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
416 adjusted_bias_data += 4;
417 vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
418 adjusted_bias_data += 4;
419
420 filter_block += 8;
421 }
422 }
423
424 static inline void Run(const typename QuantizationTypeImpl<
425 quantization_type>::ExternalType* filter_data,
426 const int32* bias_data, int8* shuffled_filter_data,
427 int32* adjusted_bias_data,
428 const DepthwiseConvDotProdParams* function_params) {
429 ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
430 adjusted_bias_data, function_params);
431 }
432 };
433 #endif
434
435 template <QuantizationType quantization_type, int32 max_padding>
436 struct PackMacroBlock<
437 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
438 DepthwiseConvDepthMultiplication::kNoMultiplication, max_padding> {
439 // A straight copy of a macro block of input data into a scratch buffer.
440 //
441 // Requirement: depth_micro_repeats > 0.
442 static inline void CopyMacroBlock(
443 int32 height_block_number, int32 width_block_number,
444 const DepthwiseConvDotProdParams& function_params,
445 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
446 input_block_data,
447 int8* scratch_block_data) {
448 TFLITE_DCHECK_LE(max_padding, 1);
449
450 // Strides.
451 // The input depth and count of micro blocks provide the width strides.
452 const int input_height_stride = function_params.input_height_stride;
453 const int workspace_height_stride = function_params.workspace_height_stride;
454 const int input_depth = function_params.input_depth;
455 const int depth_micro_repeats = function_params.depth_micro_repeats;
456 TFLITE_DCHECK_GT(depth_micro_repeats, 0);
457
458 // Remaining iteration and dimension parameters.
459 //
460 // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
461 // final micro block is incomplete.
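// For example (illustrative), a macro-block input width of 10 points gives
// input_width_micro_repeats = 2 full micro blocks of 4, residual_width = 2,
// and width_overall_micro_repeats = 3; only the first two points of the
// third micro block come from the input.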
462 const int width_overall_micro_repeats =
463 function_params.input_width_overall_micro_repeats;
464 int input_width_micro_repeats = function_params.input_width_micro_repeats;
465 const int residual_width = function_params.residual_width;
466 const int block_height = function_params.inbound_block_height;
467
468 const int padding_left = function_params.padding_left;
469 const int padding_right = function_params.padding_right;
470 const int padding_top = function_params.padding_top;
471 const int padding_bottom = function_params.padding_bottom;
472
473 const bool leading_width_padding =
474 padding_left > 0 && width_block_number == 0;
475 const bool trailing_width_padding =
476 padding_right > 0 &&
477 width_block_number == (function_params.width_macro_count - 1);
478 const bool leading_height_padding =
479 padding_top > 0 && height_block_number < 0;
480 const bool trailing_height_padding =
481 padding_bottom > 0 &&
482 height_block_number == (function_params.height_macro_count - 1);
483
484 // Modify the trailing case to reflect the input width.
485 int input_residual_width =
486 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
487 : 4;
488 if (trailing_width_padding) {
489 input_residual_width -= 1;
490 input_width_micro_repeats = width_overall_micro_repeats - 1;
491 }
492
493 constexpr int kSymmetricZeroPoint =
494 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
495 const int32 input_offset_difference =
496 function_params.input_offset + kSymmetricZeroPoint;
497
498 // We load data into a temporary buffer and then save, to match subsequent
499 // processing. This will make it easier to combine stages into one ASM
500 // routine.
501 int8 tmp_load[4][2][4];
502
503 int copy_block_height = block_height;
504 if (leading_height_padding) {
505 memset(scratch_block_data, -input_offset_difference,
506 workspace_height_stride);
507 scratch_block_data += workspace_height_stride;
508 input_block_data += input_height_stride;
509 copy_block_height -= 1;
510 }
511 if (trailing_height_padding) {
512 copy_block_height -= 1;
513 }
514
515 // The outer 3 loops go through all the micro blocks in a macro block.
516 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
517 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
518 // Figure out division of work (available input vs trailing padding).
519 int adjusted_residual_width =
520 j_width == input_width_micro_repeats ? input_residual_width : 4;
521
522 int start_width = 0;
523 if (leading_width_padding && j_width == 0) {
524 start_width = 1;
525 memset(tmp_load[0][0], -input_offset_difference, 8);
526 }
527 if (adjusted_residual_width < 4) {
528 for (int x = adjusted_residual_width; x < 4; ++x) {
529 memset(tmp_load[x][0], -input_offset_difference, 8);
530 }
531 }
532
533 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
534 // The inner 3 loops go through the sub-block, depth and width within
535 // each micro block.
536
537 // Load, and apply symmetric offset.
538 int8* scratch_data =
539 scratch_block_data + k_height * workspace_height_stride +
540 j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
541 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
542 input_data = input_block_data + k_height * input_height_stride +
543 j_width * 4 * input_depth + i_depth * 8;
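// Offset arithmetic, spelled out (illustrative): within one workspace row
// each depth micro block owns a contiguous run of
// width_overall_micro_repeats * 32 bytes, and within that run each width
// micro block is 32 bytes (4 width points x 8 depth values). E.g. with
// width_overall_micro_repeats = 3, the micro block at j_width = 1,
// i_depth = 2 starts 1 * 32 + 2 * 32 * 3 = 224 bytes into the row, while its
// input starts 1 * 4 * input_depth + 2 * 8 bytes into the input row.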
544 // Full-size micro blocks are 2*4*4 = 32 bytes.
545 for (int x = start_width; x < adjusted_residual_width; ++x) {
546 for (int s = 0; s < 2; ++s) {
547 for (int d = 0; d < 4; ++d) {
548 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
549 kSymmetricZeroPoint;
550 }
551 }
552 }
553
554 // Save results.
555 memcpy(&scratch_data[0], tmp_load[0][0], 8);
556 memcpy(&scratch_data[8], tmp_load[1][0], 8);
557 memcpy(&scratch_data[16], tmp_load[2][0], 8);
558 memcpy(&scratch_data[24], tmp_load[3][0], 8);
559 }
560 }
561 }
562
563 if (trailing_height_padding) {
564 memset(scratch_block_data + copy_block_height * workspace_height_stride,
565 -input_offset_difference, workspace_height_stride);
566 }
567 }
568
569 // Transpose 4x4 blocks within each sub-micro-block.
570 //
571 // Implemented somewhat like NEON register manipulation, so that we can see
572 // equivalence of the two approaches.
573 static inline void MicroTransposeBlocks(
574 const DepthwiseConvDotProdParams& function_params,
575 int8* scratch_block_data) {
576 const int workspace_height_stride = function_params.workspace_height_stride;
577 const int width_overall_micro_repeats =
578 function_params.input_width_overall_micro_repeats;
579 const int depth_micro_repeats = function_params.depth_micro_repeats;
580 const int block_height = function_params.inbound_block_height;
581
582 // Transpositions are 4x4, but doing 2 at a time is more efficient in the
583 // NEON code we are simulating.
584 int8 tmp_load[4][2][4]; // [width][sub-block][depth]
585 int8 tmp_transposed[4][2][4]; // [depth][sub-block][width]
586 int8 tmp_interleaved[2][4][4]; // [sub-block][depth][width]
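// Net effect of steps A-D below (illustrative): the byte at micro-block
// position (width x, sub-block s, depth d), i.e. scratch offset
// x * 8 + s * 4 + d, is rewritten to offset s * 16 + d * 4 + x, so each
// 32-byte micro block changes from [width][sub-block][depth] order to the
// [sub-block][depth][width] order consumed by the dot-product kernels.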
587
588 // The outer 3 loops go through all the micro blocks in a macro block.
589 for (int k_height = 0; k_height < block_height; ++k_height) {
590 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
591 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
592 int8* scratch_data =
593 scratch_block_data + k_height * workspace_height_stride +
594 j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
595 // A. Load data
596 memcpy(tmp_load[0][0], &scratch_data[0], 8);
597 memcpy(tmp_load[1][0], &scratch_data[8], 8);
598 memcpy(tmp_load[2][0], &scratch_data[16], 8);
599 memcpy(tmp_load[3][0], &scratch_data[24], 8);
600
601 // B. Simulate between-register transposition.
602 for (int x = 0; x < 4; ++x) {
603 for (int y = 0; y < 4; ++y) {
604 tmp_transposed[x][0][y] = tmp_load[y][0][x];
605 tmp_transposed[x][1][y] = tmp_load[y][1][x];
606 }
607 }
608
609 // C. Simulate between-register interleaving.
610 for (int x = 0; x < 4; ++x) {
611 for (int y = 0; y < 4; ++y) {
612 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
613 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
614 }
615 }
616 // D. Simulate mangled storage arrangement.
617 memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
618 memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
619 }
620 }
621 }
622 }
623
624 static inline void Run(
625 int32 height_block_number, int32 width_block_number,
626 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
627 input_block_data,
628 int8* scratch_block_data,
629 const DepthwiseConvDotProdParams* function_params) {
630 CopyMacroBlock(height_block_number, width_block_number, *function_params,
631 input_block_data, scratch_block_data);
632 MicroTransposeBlocks(*function_params, scratch_block_data);
633 }
634 };
635
636 template <QuantizationType quantization_type, int32 max_padding>
637 struct PackMacroBlock<
638 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
639 DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
640 static inline void Run(
641 int32 height_block_number, int32 width_block_number,
642 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
643 input_block_data,
644 int8* scratch_block_data,
645 const DepthwiseConvDotProdParams* function_params) {
646 // Currently support for padding is limited to 1 on any side.
647 TFLITE_DCHECK_LE(max_padding, 1);
648
649 // Strides.
650 // The count of micro blocks (below) provides the width strides.
651 const int input_height_stride = function_params->input_height_stride;
652 const int workspace_height_stride =
653 function_params->workspace_height_stride;
654
655 // Remaining iteration and dimension parameters.
656 //
657 // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
658 // final micro block is incomplete.
659 const int width_overall_micro_repeats =
660 function_params->input_width_overall_micro_repeats;
661 const int input_width_micro_repeats =
662 function_params->input_width_micro_repeats;
663 const int residual_width = function_params->residual_width;
664 const int block_height = function_params->inbound_block_height;
665 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
666
667 const int padding_left = function_params->padding_left;
668 const int padding_right = function_params->padding_right;
669 const int padding_top = function_params->padding_top;
670 const int padding_bottom = function_params->padding_bottom;
671
672 const bool leading_width_padding =
673 padding_left > 0 && width_block_number == 0;
674 const bool trailing_width_padding =
675 padding_right > 0 &&
676 width_block_number == (function_params->width_macro_count - 1);
677 const bool leading_height_padding =
678 padding_top > 0 && height_block_number < 0;
679 const bool trailing_height_padding =
680 padding_bottom > 0 &&
681 height_block_number == (function_params->height_macro_count - 1);
682
683 constexpr int kSymmetricZeroPoint =
684 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
685 const int32 input_offset_difference =
686 function_params->input_offset + kSymmetricZeroPoint;
687
688 int copy_block_height = block_height;
689 if (leading_height_padding) {
690 memset(scratch_block_data, -input_offset_difference,
691 workspace_height_stride + kWorkspaceExtension);
692 scratch_block_data += workspace_height_stride;
693 input_block_data += input_height_stride;
694 copy_block_height -= 1;
695 }
696 if (trailing_height_padding) {
697 copy_block_height -= 1;
698 }
699
700 int adjusted_residual_width =
701 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
702 : 4;
703
704 if (trailing_width_padding) {
705 adjusted_residual_width -= 1;
706 }
707 int start_width = 0;
708 if (leading_width_padding) {
709 start_width = 1;
710 input_block_data += 1;
711 }
712
713 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
714 adjusted_residual_width - start_width;
715
716 TFLITE_DCHECK_LE(
717 copy_size,
718 input_height_stride - width_block_number * input_width_micro_repeats);
719 // We may drop up to stride-1 of trailing input.
720 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
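// Worked example (illustrative): with width_overall_micro_repeats = 3,
// residual_width = 2, leading width padding and no trailing padding,
// start_width = 1 and copy_size = (3 - 1) * 4 + 2 - 1 = 9. Nine input bytes
// per row land in workspace columns 1..9 of the 12-column row; column 0 is
// padding and columns 10..11 (plus the extension) are filled by the memset
// below.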
721
722 // When there is unit input depth, the micro-block iteration need only be
723 // through the height. The micro blocks are contiguous across the width.
724 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
725 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
726 input_data = input_block_data + k_height * input_height_stride;
727 int8* scratch_data =
728 scratch_block_data + k_height * workspace_height_stride;
729
730 // Handle leading padding. This is overwritten if there is no padding.
731 scratch_data[0] = -input_offset_difference;
732
733 memcpy(&scratch_data[start_width], input_data, copy_size);
734 for (int i = 0; i < copy_size; ++i) {
735 scratch_data[start_width + i] += -kSymmetricZeroPoint;
736 }
737
738 // Handle trailing padding, and fill in remainder of micro block.
739 memset(&scratch_data[start_width + copy_size], -input_offset_difference,
740 4 - adjusted_residual_width + kWorkspaceExtension);
741 }
742
743 if (trailing_height_padding) {
744 memset(scratch_block_data + copy_block_height * workspace_height_stride,
745 -input_offset_difference,
746 workspace_height_stride + kWorkspaceExtension);
747 }
748 }
749 };
750
751 // Beginning of code section containing intermediate code transformation.
752 //
753 // This section is only compiled when kUseUnwound3x3DotProduct versions of
754 // templated functions are selected.
755 template <QuantizationType quantization_type>
756 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
757 quantization_type,
758 DepthwiseConvDepthMultiplication::kNoMultiplication,
759 /*max_padding=*/0> {
760 static inline void Run(
761 int32 height_block_number, int32 width_block_number,
762 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
763 input_block_data,
764 int8* scratch_block_data,
765 const DepthwiseConvDotProdParams* function_params) {
766 const int workspace_height_stride =
767 function_params->workspace_height_stride;
768 const int width_overall_micro_repeats =
769 function_params->input_width_overall_micro_repeats;
770 const int input_width_micro_repeats =
771 function_params->input_width_micro_repeats;
772 const int depth_micro_repeats = function_params->depth_micro_repeats;
773 const int block_height = function_params->inbound_block_height;
774 const int residual_width = function_params->residual_width;
775 const int input_height_stride = function_params->input_height_stride;
776 const int input_depth = function_params->input_depth;
777
778 TFLITE_DCHECK_GE(depth_micro_repeats, 0);
779 constexpr int kSymmetricZeroPoint =
780 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
781 const int micro_block_size = 4 * 8;
782 const int depth_advance = width_overall_micro_repeats * micro_block_size;
783 const int width_advance =
784 micro_block_size *
785 (1 - depth_micro_repeats * width_overall_micro_repeats);
786 const int height_advance = workspace_height_stride -
787 width_overall_micro_repeats * micro_block_size;
788 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
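// Pointer bookkeeping, spelled out (illustrative): e.g. with
// width_overall_micro_repeats = 3 and depth_micro_repeats = 2,
// depth_advance = 96 and width_advance = 32 * (1 - 2 * 3) = -160, so each
// width step nets 2 * 96 - 160 = 32 bytes and lands on the next width micro
// block at depth 0; height_advance then jumps to the next workspace row.
// Similarly, input_depth_skip leaves input_data advanced by exactly
// 4 * input_depth (4 width points) per width step.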
789
790 // Transpositions are 4x4, but doing 2 at a time is more efficient in the
791 // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
792 // down the depth.
793 int8 tmp_load[4][2][4];
794 int8 tmp_transposed[4][2][4];
795 int8 tmp_interleaved[2][4][4];
796
797 // Work through one slice, by row, at a time.
798 int8* scratch_data = scratch_block_data;
799 for (int k_height = 0; k_height < block_height; ++k_height) {
800 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
801 input_data = input_block_data;
802 input_block_data += input_height_stride;
803
804 // Traverse the width one point at a time, but the depth in (micro) blocks
805 // of size 8.
806 //
807 // The depth and width margins, which are filled with "zeros", may be
808 // larger than is strictly needed to calculate output. This is because the
809 // conv calculation is performed across complete micro blocks.
810 for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
811 // Load, then zero.
812 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
813 // A. Simulate register loading.
814 for (int x = 0; x < 4; ++x) {
815 for (int s = 0; s < 2; ++s) {
816 for (int d = 0; d < 4; ++d) {
817 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
818 kSymmetricZeroPoint;
819 }
820 }
821 }
822 // B. Simulate between-register transposition.
823 for (int x = 0; x < 4; ++x) {
824 for (int y = 0; y < 4; ++y) {
825 tmp_transposed[x][0][y] = tmp_load[y][0][x];
826 tmp_transposed[x][1][y] = tmp_load[y][1][x];
827 }
828 }
829
830 // C and D are to be performed together as 4-byte stores in NEON code.
831 // C. Simulate between-register interleaving.
832 for (int x = 0; x < 4; ++x) {
833 for (int y = 0; y < 4; ++y) {
834 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
835 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
836 }
837 }
838 // D. Simulate mangled storage arrangement.
839 memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
840 memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
841 memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
842 memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
843
844 scratch_data += depth_advance;
845 input_data += 8;
846 }
847 scratch_data += width_advance;
848 input_data += input_depth_skip;
849 }
850 if (width_overall_micro_repeats > input_width_micro_repeats) {
851 TFLITE_DCHECK_EQ(width_overall_micro_repeats,
852 input_width_micro_repeats + 1);
853 TFLITE_DCHECK_GT(residual_width, 0);
854 // Figure out division of work (available input vs zero-ed).
855 const int adjusted_residual_width = residual_width;
856 // Load, then zero.
857 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
858 // A. Simulate register loading.
859 for (int x = 0; x < adjusted_residual_width; ++x) {
860 for (int s = 0; s < 2; ++s) {
861 for (int d = 0; d < 4; ++d) {
862 tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
863 kSymmetricZeroPoint;
864 }
865 }
866 }
867 for (int x = adjusted_residual_width; x < 4; ++x) {
868 for (int s = 0; s < 2; ++s) {
869 for (int d = 0; d < 4; ++d) {
870 tmp_load[x][s][d] = 0;
871 }
872 }
873 }
874 // B. Simulate between-register transposition.
875 for (int x = 0; x < 4; ++x) {
876 for (int y = 0; y < 4; ++y) {
877 tmp_transposed[x][0][y] = tmp_load[y][0][x];
878 tmp_transposed[x][1][y] = tmp_load[y][1][x];
879 }
880 }
881
882 // C and D are to be performed together as 4-byte stores in NEON code.
883 // C. Simulate between-register interleaving.
884 for (int x = 0; x < 4; ++x) {
885 for (int y = 0; y < 4; ++y) {
886 tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
887 tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
888 }
889 }
890 // D. Simulate mangled storage arrangement.
891 memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
892 memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
893 memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
894 memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
895
896 scratch_data += depth_advance;
897 input_data += 8;
898 }
899 scratch_data += width_advance;
900 input_data += input_depth_skip;
901 }
902 scratch_data += height_advance;
903 }
904
905 TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
906 block_height * workspace_height_stride);
907 }
908 };
909
910 template <QuantizationType quantization_type>
911 struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
912 quantization_type,
913 DepthwiseConvDepthMultiplication::kNoMultiplication,
914 /*max_padding=*/1> {
915 static inline void Run(
916 int32 height_block_number, int32 width_block_number,
917 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
918 input_block_data,
919 int8* scratch_block_data,
920 const DepthwiseConvDotProdParams* function_params) {
921 // Just use the C-model code for the padded case. Optimized versions merge the
922 // modifications therein to handle padding.
923 PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
924 quantization_type,
925 DepthwiseConvDepthMultiplication::kNoMultiplication,
926 /*max_padding=*/1>::Run(height_block_number,
927 width_block_number, input_block_data,
928 scratch_block_data, function_params);
929 }
930 };
931
932 template <QuantizationType quantization_type, int32 max_padding>
933 struct PackMacroBlock<
934 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
935 DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
936 static inline void Run(
937 int32 height_block_number, int32 width_block_number,
938 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
939 input_block_data,
940 int8* scratch_block_data,
941 const DepthwiseConvDotProdParams* function_params) {
942 const int workspace_height_stride =
943 function_params->workspace_height_stride;
944 const int width_overall_micro_repeats =
945 function_params->input_width_overall_micro_repeats;
946 const int input_width_micro_repeats =
947 function_params->input_width_micro_repeats;
948 const int block_height = function_params->inbound_block_height;
949 const int residual_width = function_params->residual_width;
950 const int input_height_stride = function_params->input_height_stride;
951
952 const int padding_left = function_params->padding_left;
953 const int padding_right = function_params->padding_right;
954 const int padding_top = function_params->padding_top;
955 const int padding_bottom = function_params->padding_bottom;
956
957 constexpr int kSymmetricZeroPoint =
958 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
959
960 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
961
962 const bool leading_width_padding =
963 padding_left > 0 && width_block_number == 0;
964 const bool trailing_width_padding =
965 padding_right > 0 &&
966 width_block_number == (function_params->width_macro_count - 1);
967 const bool leading_height_padding =
968 padding_top > 0 && height_block_number < 0;
969 const bool trailing_height_padding =
970 padding_bottom > 0 &&
971 height_block_number == (function_params->height_macro_count - 1);
972
973 const int32 input_offset = function_params->input_offset;
974 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
975
976 // Work through one slice, by row, at a time.
977 int8* scratch_data_base = scratch_block_data;
978
979 int copy_block_height = block_height;
980 if (leading_height_padding) {
981 copy_block_height -= 1;
982 memset(scratch_data_base, -input_offset_difference,
983 workspace_height_stride + kWorkspaceExtension);
984 scratch_data_base += workspace_height_stride;
985 input_block_data += input_height_stride;
986 }
987 if (trailing_height_padding) {
988 copy_block_height -= 1;
989 }
990
991 int adjusted_residual_width =
992 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
993 : 4;
994
995 if (trailing_width_padding) {
996 adjusted_residual_width -= 1;
997 }
998 int start_width = 0;
999 if (leading_width_padding) {
1000 start_width = 1;
1001 input_block_data += 1;
1002 }
1003
1004 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1005 adjusted_residual_width - start_width;
1006 // Adjusted so that later conditionals are simplified.
1007 const int copy_size_adjusted =
1008 trailing_width_padding ? copy_size + 1 : copy_size;
1009
1010 TFLITE_DCHECK_LE(
1011 copy_size,
1012 input_height_stride - width_block_number * input_width_micro_repeats);
1013 // We may drop up to stride-1 of trailing input.
1014 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1015
1016 // This is used to simulate what should happen in registers.
1017 int8 tmp_data[16];
1018
1019 int scratch_data_offset = 0;
1020 int input_block_offset = 0;
1021
1022 if (copy_size >= 16) {
1023 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1024 // Work through one slice, by row, at a time.
1025 int8* scratch_data = scratch_data_base + scratch_data_offset;
1026
1027 int copy_done = 0;
1028
1029 // The surrounding condition ensures that we always need at least one
1030 // iteration of the main copy loop. In the case of leading width
1031 // padding, we unroll this specially.
1032 if (leading_width_padding) {
1033 memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
1034 for (int i = 0; i < 16; ++i) {
1035 tmp_data[i] += -kSymmetricZeroPoint;
1036 }
1037 tmp_data[0] = -input_offset_difference;
1038 memcpy(scratch_data, tmp_data, 16);
1039 copy_done += 15;
1040 }
1041
1042 // Main copy loop.
1043 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
1044 memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1045 16);
1046 for (int i = 0; i < 16; ++i) {
1047 tmp_data[i] += -kSymmetricZeroPoint;
1048 }
1049 TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
1050 memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
1051 }
1052
1053 const int copy_remaining = copy_size - copy_done;
1054 // Total amount
1055 // = copy_size - copy_done + 4 - adjusted_residual_width
1056 // = width_overall_micro_repeats * 4 - start_width - copy_done.
1057 // Undone micro blocks
1058 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1059
1060 // Conditional is (copy_remaining > 0 || trailing_width_padding).
1061 if (copy_done < copy_size_adjusted) {
1062 // Employ overlapping-load strategy in order to load full register,
1063 // but use only part.
1064 memcpy(tmp_data,
1065 input_block_data + input_block_offset + copy_done -
1066 (16 - copy_remaining),
1067 16);
1068 // Shift to select the part that we need.
1069 for (int i = 0; i < copy_remaining; ++i) {
1070 tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
1071 }
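// E.g. (illustrative) with copy_remaining = 5, the load above starts 11
// bytes before the first byte still needed, so those bytes arrive in
// tmp_data[11..15]; the shift just performed moves them to tmp_data[0..4],
// and tmp_data[5..15] is overwritten with padding values below.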
1072 for (int i = 0; i < 16; ++i) {
1073 tmp_data[i] += -kSymmetricZeroPoint;
1074 }
1075 // Apply padding to remainder, some unnecessary but costless in regs.
1076 for (int i = copy_remaining; i < 16; ++i) {
1077 tmp_data[i] = -input_offset_difference;
1078 }
1079 const int final_repeats =
1080 width_overall_micro_repeats - (start_width + copy_done) / 4;
1081 for (int i = 0; i < final_repeats; ++i) {
1082 memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
1083 copy_done += 4;
1084 }
1085 }
1086 memset(scratch_data + start_width + copy_done, -input_offset_difference,
1087 kWorkspaceExtension);
1088
1089 scratch_data_offset += workspace_height_stride;
1090 input_block_offset += input_height_stride;
1091 }
1092 } else if (copy_size >= 4) {
1093 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1094 // Work through one slice, by row, at a time.
1095 int8* scratch_data = scratch_data_base + scratch_data_offset;
1096
1097 int copy_done = 0;
1098
1099 // The surrounding condition ensures that we always need at least one
1100 // iteration of the main copy loop. In the case of leading width
1101 // padding, we unroll this specially.
1102 if (leading_width_padding) {
1103 memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
1104 for (int i = 0; i < 4; ++i) {
1105 tmp_data[i] += -kSymmetricZeroPoint;
1106 }
1107 tmp_data[0] = -input_offset_difference;
1108 memcpy(scratch_data, tmp_data, 4);
1109 copy_done += 3;
1110 }
1111
1112 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
1113 memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1114 4);
1115 for (int i = 0; i < 4; ++i) {
1116 tmp_data[i] += -kSymmetricZeroPoint;
1117 }
1118 // Perform these as 4-byte (int32) stores, because that is our alignment.
1119 memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1120 }
1121
1122 // Total amount
1123 // = copy_size - copy_done + 4 - adjusted_residual_width
1124 // = width_overall_micro_repeats * 4 - start_width - copy_done.
1125 // Undone micro blocks
1126 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1127 const int copy_remaining = copy_size - copy_done;
1128 // Conditional is (copy_remaining > 0 || trailing_width_padding).
1129 if (copy_done < copy_size_adjusted) {
1130 TFLITE_DCHECK_LT(copy_remaining, 4);
1131 // Employ overlapping-load strategy in order to load full register,
1132 // but use only part.
1133 memcpy(tmp_data,
1134 input_block_data + input_block_offset + copy_done -
1135 (4 - copy_remaining),
1136 4);
1137 // Shift to select the part that we need.
1138 for (int i = 0; i < copy_remaining; ++i) {
1139 tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
1140 }
1141 for (int i = 0; i < 4; ++i) {
1142 tmp_data[i] += -kSymmetricZeroPoint;
1143 }
1144 // Apply padding to remainder, some unnecessary but costless in regs.
1145 for (int i = copy_remaining; i < 4; ++i) {
1146 tmp_data[i] = -input_offset_difference;
1147 }
1148 memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1149 copy_done += 4;
1150 }
1151 memset(scratch_data + start_width + copy_done, -input_offset_difference,
1152 kWorkspaceExtension);
1153
1154 scratch_data_offset += workspace_height_stride;
1155 input_block_offset += input_height_stride;
1156 }
1157 } else if (width_overall_micro_repeats == 2) {
1158 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1159 // Apply padding by quick fill of whole reg.
1160 for (int i = 0; i < 8; ++i) {
1161 tmp_data[i] = -input_offset;
1162 }
1163 for (int i = 0; i < copy_size; ++i) {
1164 // Apply shift-left insert, tmp_data as both operands.
1165 // The zero-index byte is left unchanged.
1166 for (int i = 7; i > 0; --i) {
1167 tmp_data[i] = tmp_data[i - 1];
1168 }
1169 tmp_data[1] =
1170 input_block_data[input_block_offset + (copy_size - 1 - i)];
1171 }
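// After the loop above (illustrative trace with copy_size = 3):
// tmp_data[1..3] holds input bytes 0..2 in order, tmp_data[0] still holds
// the padding value, and tmp_data[4..7] remains padding. The shift just
// below then drops the leading byte when there is no leading width padding.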
1172 if (!leading_width_padding) {
1173 // Remove leading padding, junking trailing byte, OK because max size
1174 // is less than 8.
1175 TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1176 for (int i = 0; i < 7; ++i) {
1177 tmp_data[i] = tmp_data[i + 1];
1178 }
1179 }
1180 for (int i = 0; i < 8; ++i) {
1181 tmp_data[i] += -kSymmetricZeroPoint;
1182 }
1183 memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
1184 memset(scratch_data_base + scratch_data_offset + 8,
1185 -input_offset_difference, kWorkspaceExtension);
1186
1187 scratch_data_offset += workspace_height_stride;
1188 input_block_offset += input_height_stride;
1189 }
1190 } else {
1191 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
1192 // This path is basically the same as the preceding, 2-micro-block one,
1193 // but here we simply store fewer bytes.
1194 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1195 // Apply padding by quick fill of whole reg.
1196 for (int i = 0; i < 8; ++i) {
1197 tmp_data[i] = -input_offset;
1198 }
1199 for (int i = 0; i < copy_size; ++i) {
1200 // Apply shift-left insert, tmp_data as both operands.
1201 // The zero-index byte is left unchanged.
1202 for (int i = 7; i > 0; --i) {
1203 tmp_data[i] = tmp_data[i - 1];
1204 }
1205 tmp_data[1] =
1206 input_block_data[input_block_offset + (copy_size - 1 - i)];
1207 }
1208 if (!leading_width_padding) {
1209 // Remove leading padding, junking trailing byte, OK because max size
1210 // is less than 8.
1211 TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1212 for (int i = 0; i < 7; ++i) {
1213 tmp_data[i] = tmp_data[i + 1];
1214 }
1215 }
1216 for (int i = 0; i < 8; ++i) {
1217 tmp_data[i] += -kSymmetricZeroPoint;
1218 }
1219 memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
1220 memset(scratch_data_base + scratch_data_offset + 4,
1221 -input_offset_difference, kWorkspaceExtension);
1222
1223 scratch_data_offset += workspace_height_stride;
1224 input_block_offset += input_height_stride;
1225 }
1226 }
1227
1228 scratch_data_base += copy_block_height * workspace_height_stride;
1229
1230 if (trailing_height_padding) {
1231 memset(scratch_data_base, -input_offset_difference,
1232 workspace_height_stride + kWorkspaceExtension);
1233 scratch_data_base += workspace_height_stride;
1234 }
1235
1236 TFLITE_DCHECK_EQ(
1237 scratch_data_base,
1238 scratch_block_data + block_height * workspace_height_stride);
1239 }
1240 };
1241 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
1242 // of templated functions are selected.
1243 //
1244 // End of code section containing intermediate code transformation.
1245
1246 #ifdef USE_NEON
1247 template <QuantizationType quantization_type>
1248 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1249 quantization_type,
1250 DepthwiseConvDepthMultiplication::kNoMultiplication,
1251 /*max_padding=*/0> {
1252 static inline void PackMacroBlockIntrinsics(
1253 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1254 input_block_data,
1255 int8* scratch_block_data,
1256 const DepthwiseConvDotProdParams* function_params) {
1257 TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
1258 TFLITE_DCHECK_EQ(function_params->padding_top, 0);
1259 TFLITE_DCHECK_EQ(function_params->padding_left, 0);
1260 TFLITE_DCHECK_EQ(function_params->padding_right, 0);
1261 const int workspace_height_stride =
1262 function_params->workspace_height_stride;
1263 const int width_overall_micro_repeats =
1264 function_params->input_width_overall_micro_repeats;
1265 const int input_width_micro_repeats =
1266 function_params->input_width_micro_repeats;
1267 const int depth_micro_repeats = function_params->depth_micro_repeats;
1268 const int block_height = function_params->inbound_block_height;
1269 const int residual_width = function_params->residual_width;
1270 const int input_height_stride = function_params->input_height_stride;
1271 const int input_depth = function_params->input_depth;
1272
1273 TFLITE_DCHECK_GE(depth_micro_repeats, 0);
1274 constexpr uint8 kSignBit =
1275 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1276 const int micro_block_size = 4 * 8;
1277 const int depth_advance = width_overall_micro_repeats * micro_block_size;
1278 const int width_advance =
1279 micro_block_size *
1280 (1 - depth_micro_repeats * width_overall_micro_repeats);
1281 const int height_advance = workspace_height_stride -
1282 width_overall_micro_repeats * micro_block_size;
1283 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
1284
1285 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1286 // code. Note the blocks of 4x4 are still interleaved down the depth.
1287 int8x16_t work_reg_a;
1288 int8x16_t work_reg_b;
1289
1290 // Effect subtraction of zero-point = 128 by XOR of sign bit.
1291 const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
1292
1293 // Work through one slice, by row, at a time.
1294 int8* scratch_data_0 = scratch_block_data;
1295
1296 for (int k_height = 0; k_height < block_height; ++k_height) {
1297 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1298 input_data_0 = input_block_data;
1299 int8x16_t input_data_a;
1300 int8x16_t input_data_b;
1301 int8x16_t input_data_c;
1302 int8x16_t input_data_d;
1303
1304 // Traverse the width one point at a time, but the depth in (micro) blocks
1305 // of size 8.
1306 //
1307 // The depth and width margins, which are filled with "zeros", may be
1308 // larger than is strictly needed to calculate output. This is because the
1309 // conv calculation is performed across complete micro blocks.
1310 for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
1311 int8x16_t work_reg_a_sp;
1312 int8x16_t work_reg_b_sp;
1313
1314 int i_depth = 0;
1315
1316 if (depth_micro_repeats >= 2) {
1317 i_depth += 2;
1318
1319 input_data_a = util_vld1q_x8(input_data_0);
1320 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1321 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1322 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1323 input_data_0 += 16;
1324
1325 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1326 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1327 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1328 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1329 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1330 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1331 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1332 }
1333
1334 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1335 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1336 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1337
1338 input_data_a = util_vld1q_x8(input_data_0);
1339 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1340 vst1q_s8(scratch_data_0, work_reg_a);
1341 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1342
1343 scratch_data_0 += depth_advance;
1344
1345 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1346 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1347 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1348 }
1349
1350 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1351 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1352 vst1q_s8(scratch_data_0, work_reg_a_sp);
1353 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1354
1355 scratch_data_0 += depth_advance;
1356 input_data_0 += 16;
1357 }
1358
1359 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1360 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1361 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1362 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1363 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1364 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1365 }
1366 vst1q_s8(scratch_data_0, work_reg_a);
1367 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1368
1369 scratch_data_0 += depth_advance;
1370
1371 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1372 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1373 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1374 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1375 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1376 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1377 }
1378
1379 vst1q_s8(scratch_data_0, work_reg_a_sp);
1380 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1381
1382 scratch_data_0 += depth_advance;
1383 }
1384 for (; i_depth < depth_micro_repeats; ++i_depth) {
1385 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1386 input_data_b =
1387 vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
1388 input_data_c =
1389 vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
1390 input_data_d =
1391 vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
1392 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1393 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1394
1395 input_data_0 += 8;
1396
1397 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1398 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1399 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1400 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1401 }
1402
1403 vst1q_s8(scratch_data_0, work_reg_a);
1404 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1405
1406 scratch_data_0 += depth_advance;
1407 }
1408 scratch_data_0 += width_advance;
1409 input_data_0 += input_depth_skip;
1410 }
1411 if (width_overall_micro_repeats > input_width_micro_repeats) {
1412 TFLITE_DCHECK_EQ(width_overall_micro_repeats,
1413 input_width_micro_repeats + 1);
1414 TFLITE_DCHECK_GT(residual_width, 0);
1415 TFLITE_DCHECK_LT(residual_width, 4);
1416 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1417 input_data_c = vdupq_n_u8(kSignBit);
1418 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1419 input_data_d = vdupq_n_u8(kSignBit);
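// input_data_c and input_data_d are pre-filled with kSignBit so that any lanes
// not overwritten by a load reach the workspace as zero after the optional
// sign-bit XOR below.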
1420 if (residual_width > 1) {
1421 input_data_b =
1422 vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
1423 if (residual_width == 3) {
1424 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1425 input_data_c, 0);
1426 }
1427 }
1428 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1429 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1430
1431 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1432 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1433 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1434 }
1435 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1436
1437 vst1q_s8(scratch_data_0, work_reg_a);
1438 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1439
1440 scratch_data_0 += depth_advance;
1441 input_data_0 += 8;
1442 }
1443 scratch_data_0 += width_advance;
1444 input_data_0 += input_depth_skip;
1445 }
1446
1447 scratch_data_0 += height_advance;
1448 input_block_data += input_height_stride;
1449 }
1450 TFLITE_DCHECK_EQ(
1451 scratch_data_0,
1452 scratch_block_data + block_height * workspace_height_stride);
1453 }
1454
1455 static inline void Run(
1456 int32 height_block_number, int32 width_block_number,
1457 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1458 input_block_data,
1459 int8* scratch_block_data,
1460 const DepthwiseConvDotProdParams* function_params) {
1461 #ifdef __aarch64__
1462 PreloadInputBlock(input_block_data, function_params);
1463 #endif
1464 PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
1465 function_params);
1466 }
1467 };
1468
1469 template <QuantizationType quantization_type>
1470 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1471 quantization_type,
1472 DepthwiseConvDepthMultiplication::kNoMultiplication,
1473 /*max_padding=*/1> {
1474 static inline void PackMacroBlockIntrinsics(
1475 int32 height_block_number, int32 width_block_number,
1476 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1477 input_block_data,
1478 int8* scratch_block_data,
1479 const DepthwiseConvDotProdParams* function_params) {
1480 constexpr uint8 kSignBit =
1481 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1482
1483 const int workspace_height_stride =
1484 function_params->workspace_height_stride;
1485 const int width_overall_micro_repeats =
1486 function_params->input_width_overall_micro_repeats;
1487 const int input_width_micro_repeats =
1488 function_params->input_width_micro_repeats;
1489 const int depth_micro_repeats = function_params->depth_micro_repeats;
1490 const int block_height = function_params->inbound_block_height;
1491 const int residual_width = function_params->residual_width;
1492 const int input_height_stride = function_params->input_height_stride;
1493 const int input_depth = function_params->input_depth;
1494
1495 const int padding_left = function_params->padding_left;
1496 const int padding_right = function_params->padding_right;
1497 const int padding_top = function_params->padding_top;
1498 const int padding_bottom = function_params->padding_bottom;
1499
1500 TFLITE_DCHECK_GT(depth_micro_repeats, 0);
1501 constexpr int kSymmetricZeroPoint =
1502 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1503
1504 const int micro_block_size = 4 * 8;
1505 const int depth_advance = width_overall_micro_repeats * micro_block_size;
1506 const int width_advance =
1507 micro_block_size *
1508 (1 - depth_micro_repeats * width_overall_micro_repeats);
1509 const int height_advance = workspace_height_stride -
1510 width_overall_micro_repeats * micro_block_size;
1511 const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
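// For reference: within one workspace row the packed layout is
// [depth micro][width micro][32-byte transposed block]. depth_advance steps
// over a full row of width micro blocks, width_advance rewinds that and moves
// to the next 32-byte width micro block, and height_advance then lands on the
// start of the next workspace row. input_depth_skip moves the source pointer
// from the end of one 4-wide group of input points to the start of the next
// (4 * input_depth source bytes per group).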
1512
1513 const bool leading_width_padding =
1514 padding_left > 0 && width_block_number == 0;
1515 const bool trailing_width_padding =
1516 padding_right > 0 &&
1517 width_block_number == (function_params->width_macro_count - 1);
1518 const bool leading_height_padding =
1519 padding_top > 0 && height_block_number < 0;
1520 const bool trailing_height_padding =
1521 padding_bottom > 0 &&
1522 height_block_number == (function_params->height_macro_count - 1);
1523
1524 const int32 input_offset = function_params->input_offset;
1525 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1526
1527 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1528 // code. Note the blocks of 4x4 are still interleaved down the depth.
1529 int8x16_t work_reg_a;
1530 int8x16_t work_reg_b;
1531
1532 // Effect subtraction of zero-point = 128 by XOR of sign bit.
1533 const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
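// For example, in the uint8 path 200 ^ 0x80 = 72, and 72 as int8 equals
// 200 - 128; likewise 5 ^ 0x80 = 133, which as int8 is -123 = 5 - 128.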
1534
1535 // Work through one slice, by row, at a time.
1536 int8* scratch_data_0 = scratch_block_data;
1537
1538 int copy_block_height = block_height;
1539 if (leading_height_padding) {
1540 copy_block_height -= 1;
1541 memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1542 scratch_data_0 += workspace_height_stride;
1543 input_block_data += input_height_stride;
1544 }
1545 if (trailing_height_padding) {
1546 copy_block_height -= 1;
1547 }
1548
1549 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1550 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1551 input_data_0 = input_block_data;
1552 int8x16_t input_data_a;
1553 int8x16_t input_data_b;
1554 int8x16_t input_data_c;
1555 int8x16_t input_data_d;
1556
1557 // Traverse the width one point at a time, but the depth in (micro) blocks
1558 // of size 8.
1559 //
1560 // The depth and width margins, which are filled with "zeros", may be
1561 // larger than is strictly needed to calculate output. This is because the
1562 // conv calculation is performed across complete micro blocks.
1563 for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
1564 // Figure out division of work (available input vs zero-ed).
1565 int adjusted_residual_width =
1566 j_width == (input_width_micro_repeats) ? residual_width : 4;
1567
1568 if (trailing_width_padding &&
1569 j_width == (width_overall_micro_repeats - 1)) {
1570 adjusted_residual_width -= 1;
1571 }
1572 int start_width = 0;
1573 if (leading_width_padding && j_width == 0) {
1574 start_width = 1;
1575 }
1576 if (start_width == 0) {
1577 if (adjusted_residual_width == 4) {
1578 int8x16_t work_reg_a_sp;
1579 int8x16_t work_reg_b_sp;
1580
1581 int i_depth = 0;
1582
1583 if (depth_micro_repeats >= 2) {
1584 i_depth += 2;
1585
1586 input_data_a = util_vld1q_x8(input_data_0);
1587 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1588 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1589 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1590 input_data_0 += 16;
1591
1592 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1593 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1594 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1595 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1596 if (quantization_type ==
1597 QuantizationType::kNonPerChannelUint8) {
1598 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1599 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1600 }
1601
1602 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1603 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1604 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1605
1606 input_data_a = util_vld1q_x8(input_data_0);
1607 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1608 vst1q_s8(scratch_data_0, work_reg_a);
1609 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1610
1611 scratch_data_0 += depth_advance;
1612
1613 if (quantization_type ==
1614 QuantizationType::kNonPerChannelUint8) {
1615 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1616 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1617 }
1618
1619 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1620 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1621 vst1q_s8(scratch_data_0, work_reg_a_sp);
1622 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1623
1624 scratch_data_0 += depth_advance;
1625 input_data_0 += 16;
1626 }
1627
1628 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1629 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1630 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1631 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1632 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1633 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1634 }
1635 vst1q_s8(scratch_data_0, work_reg_a);
1636 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1637
1638 scratch_data_0 += depth_advance;
1639
1640 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1641 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1642 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1643 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1644 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1645 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1646 }
1647
1648 vst1q_s8(scratch_data_0, work_reg_a_sp);
1649 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1650
1651 scratch_data_0 += depth_advance;
1652 }
1653 for (; i_depth < depth_micro_repeats; ++i_depth) {
1654 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1655 input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1656 input_data_b, 0);
1657 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1658 input_data_c, 0);
1659 input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1660 input_data_d, 0);
1661 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1662 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1663
1664 input_data_0 += 8;
1665
1666 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1667 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1668 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1669 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1670 }
1671
1672 vst1q_s8(scratch_data_0, work_reg_a);
1673 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1674
1675 scratch_data_0 += depth_advance;
1676 }
1677 scratch_data_0 += width_advance;
1678 input_data_0 += input_depth_skip;
1679 } else {
1680 TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1681 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1682 input_data_a = vdupq_n_u8(-input_offset);
1683 input_data_b = vdupq_n_u8(-input_offset);
1684 input_data_c = vdupq_n_u8(-input_offset);
1685 input_data_d = vdupq_n_u8(-input_offset);
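// Padding lanes are seeded with -input_offset, the encoding of a real zero,
// which matches the -input_offset_difference fill used for fully padded rows.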
1686 if (adjusted_residual_width > 0) {
1687 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1688 if (adjusted_residual_width > 1) {
1689 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1690 input_data_b, 0);
1691 if (adjusted_residual_width == 3) {
1692 input_data_c = vld1q_lane_s8x8(
1693 input_data_0 + 2 * input_depth, input_data_c, 0);
1694 }
1695 }
1696 }
1697 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1698 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1699
1700 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1701 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1702 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1703 }
1704 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1705
1706 vst1q_s8(scratch_data_0, work_reg_a);
1707 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1708
1709 scratch_data_0 += depth_advance;
1710 input_data_0 += 8;
1711 }
1712 scratch_data_0 += width_advance;
1713 input_data_0 += input_depth_skip;
1714 }
1715 } else {
1716 if (adjusted_residual_width == 4) {
1717 int8x16_t work_reg_a_sp;
1718 int8x16_t work_reg_b_sp;
1719
1720 int i_depth = 0;
1721
1722 if (depth_micro_repeats >= 2) {
1723 i_depth += 2;
1724
1725 input_data_a = vdupq_n_u8(-input_offset);
1726 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1727 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1728 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1729 input_data_0 += 16;
1730
1731 for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1732 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1733 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1734 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1735 if (quantization_type ==
1736 QuantizationType::kNonPerChannelUint8) {
1737 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1738 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1739 }
1740
1741 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1742 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1743 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1744
1745 input_data_a = vdupq_n_u8(-input_offset);
1746 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1747 vst1q_s8(scratch_data_0, work_reg_a);
1748 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1749
1750 scratch_data_0 += depth_advance;
1751
1752 if (quantization_type ==
1753 QuantizationType::kNonPerChannelUint8) {
1754 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1755 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1756 }
1757
1758 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1759 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1760 vst1q_s8(scratch_data_0, work_reg_a_sp);
1761 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1762
1763 scratch_data_0 += depth_advance;
1764 input_data_0 += 16;
1765 }
1766
1767 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1768 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1769 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1770 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1771 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1772 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1773 }
1774 vst1q_s8(scratch_data_0, work_reg_a);
1775 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1776
1777 scratch_data_0 += depth_advance;
1778
1779 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1780 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1781 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1782 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1783 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1784 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1785 }
1786
1787 vst1q_s8(scratch_data_0, work_reg_a_sp);
1788 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1789
1790 scratch_data_0 += depth_advance;
1791 }
1792 for (; i_depth < depth_micro_repeats; ++i_depth) {
1793 input_data_a = vdupq_n_u8(-input_offset);
1794 input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1795 input_data_b, 0);
1796 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1797 input_data_c, 0);
1798 input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1799 input_data_d, 0);
1800 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1801 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1802
1803 input_data_0 += 8;
1804
1805 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1806 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1807 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1808 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1809 }
1810
1811 vst1q_s8(scratch_data_0, work_reg_a);
1812 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1813
1814 scratch_data_0 += depth_advance;
1815 }
1816 scratch_data_0 += width_advance;
1817 input_data_0 += input_depth_skip;
1818 } else {
1819 TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1820
1821 for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1822 input_data_a = vdupq_n_u8(-input_offset);
1823 input_data_b = vdupq_n_u8(-input_offset);
1824 input_data_c = vdupq_n_u8(-input_offset);
1825 input_data_d = vdupq_n_u8(-input_offset);
1826 // Skip loading first column.
1827 if (adjusted_residual_width > 1) {
1828 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1829 input_data_b, 0);
1830 if (adjusted_residual_width == 3) {
1831 input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1832 input_data_c, 0);
1833 }
1834 }
1835 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1836 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1837
1838 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1839 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1840 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1841 }
1842 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1843
1844 vst1q_s8(scratch_data_0, work_reg_a);
1845 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1846
1847 scratch_data_0 += depth_advance;
1848 input_data_0 += 8;
1849 }
1850 scratch_data_0 += width_advance;
1851 input_data_0 += input_depth_skip;
1852 }
1853 }
1854 }
1855 scratch_data_0 += height_advance;
1856 input_block_data += input_height_stride;
1857 }
1858
1859 if (trailing_height_padding) {
1860 memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1861 scratch_data_0 += workspace_height_stride;
1862 }
1863
1864 TFLITE_DCHECK_EQ(
1865 scratch_data_0,
1866 scratch_block_data + block_height * workspace_height_stride);
1867 }
1868
1869 static inline void Run(
1870 int32 height_block_number, int32 width_block_number,
1871 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1872 input_block_data,
1873 int8* scratch_block_data,
1874 const DepthwiseConvDotProdParams* function_params) {
1875 #ifdef __aarch64__
1876 PreloadInputBlock(input_block_data, function_params);
1877 #endif
1878
1879 PackMacroBlockIntrinsics(height_block_number, width_block_number,
1880 input_block_data, scratch_block_data,
1881 function_params);
1882 }
1883 };
1884
1885 template <QuantizationType quantization_type>
1886 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1887 quantization_type,
1888 DepthwiseConvDepthMultiplication::kUnitInputDepth,
1889 /*max_padding=*/1> {
1890 static inline void PackMacroBlockIntrinsics(
1891 int32 height_block_number, int32 width_block_number,
1892 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1893 input_block_data,
1894 int8* scratch_block_data,
1895 const DepthwiseConvDotProdParams* function_params) {
1896 const int workspace_height_stride =
1897 function_params->workspace_height_stride;
1898 const int width_overall_micro_repeats =
1899 function_params->input_width_overall_micro_repeats;
1900 const int input_width_micro_repeats =
1901 function_params->input_width_micro_repeats;
1902 const int block_height = function_params->inbound_block_height;
1903 const int residual_width = function_params->residual_width;
1904 const int input_height_stride = function_params->input_height_stride;
1905
1906 const int padding_left = function_params->padding_left;
1907 const int padding_right = function_params->padding_right;
1908 const int padding_top = function_params->padding_top;
1909 const int padding_bottom = function_params->padding_bottom;
1910
1911 constexpr int kSymmetricZeroPoint =
1912 QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1913
1914 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
1915
1916 const bool leading_width_padding =
1917 padding_left > 0 && width_block_number == 0;
1918 const bool trailing_width_padding =
1919 padding_right > 0 &&
1920 width_block_number == (function_params->width_macro_count - 1);
1921 const bool leading_height_padding =
1922 padding_top > 0 && height_block_number < 0;
1923 const bool trailing_height_padding =
1924 padding_bottom > 0 &&
1925 height_block_number == (function_params->height_macro_count - 1);
1926
1927 const int32 input_offset = function_params->input_offset;
1928 const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1929
1930 // Work through one slice, by row, at a time.
1931 int8* scratch_data_base = scratch_block_data;
1932
1933 int copy_block_height = block_height;
1934 if (leading_height_padding) {
1935 copy_block_height -= 1;
1936 memset(scratch_data_base, -input_offset_difference,
1937 workspace_height_stride + kWorkspaceExtension);
1938 scratch_data_base += workspace_height_stride;
1939 input_block_data += input_height_stride;
1940 }
1941 if (trailing_height_padding) {
1942 copy_block_height -= 1;
1943 }
1944
1945 int adjusted_residual_width =
1946 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1947 : 4;
1948
1949 if (trailing_width_padding) {
1950 adjusted_residual_width -= 1;
1951 }
1952 int start_width = 0;
1953 if (leading_width_padding) {
1954 start_width = 1;
1955 input_block_data += 1;
1956 }
1957
1958 const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1959 adjusted_residual_width - start_width;
1960 // Adjusted so that later conditionals are simplified.
1961 const int copy_size_adjusted =
1962 trailing_width_padding ? copy_size + 1 : copy_size;
1963
1964 TFLITE_DCHECK_LE(
1965 copy_size,
1966 input_height_stride - width_block_number * input_width_micro_repeats);
1967 // We may drop up to stride-1 of trailing input.
1968 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1969
1970 int scratch_data_offset = 0;
1971 int input_block_offset = 0;
1972
1973 constexpr uint8 kSignBit =
1974 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1975
1976 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1977 // code. Note the blocks of 4x4 are still interleaved down the depth.
1978 int8x16_t work_reg;
1979 int8x8_t half_work_reg;
1980 int8x8_t padding_mask;
1981
1982 // Effect subtraction of zero-point = 128 by XOR of sign bit.
1983 const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
1984 const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
1985 padding_mask = vdup_n_s8(-1);
1986 half_work_reg = vdup_n_s8(0);
1987
1988 if (copy_size >= 16) {
1989 const int copy_remaining = (copy_size + start_width) & 0x7;
1990 padding_mask = vreinterpret_s8_s64(vshl_s64(
1991 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
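// padding_mask starts as all ones; shifting it left (as a 64-bit value) by
// 8 * copy_remaining bits clears its low copy_remaining byte lanes, so the
// vbsl in the partial-copy path below keeps the valid input bytes in lanes
// 0..copy_remaining-1 and substitutes the padding value elsewhere.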
1992
1993 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1994 // Work through one slice, by row, at a time.
1995 int8* scratch_data = scratch_data_base + scratch_data_offset;
1996
1997 int copy_done = 0;
1998
1999 // The surrounding condition ensures that we always need at least one
2000 // iteration of the main copy loop. In the case of leading width
2001 // padding, we unroll this specially.
2002 if (leading_width_padding) {
2003 work_reg = util_vld1q_x8(input_block_data + input_block_offset);
2004 work_reg = vextq_s8(padding_reg, work_reg, 15);
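// vextq_s8(padding_reg, work_reg, 15) prepends one padding byte to the first
// 15 loaded bytes, so this store covers the padded leading column plus 15
// input values; hence copy_done += 15 below.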
2005 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2006 work_reg = veorq_s8(work_reg, sign_bit);
2007 }
2008 vst1q_s8(scratch_data, work_reg);
2009 copy_done += 15;
2010 }
2011
2012 // Main copy loop.
2013 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2014 work_reg =
2015 util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2016 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2017 work_reg = veorq_s8(work_reg, sign_bit);
2018 }
2019 TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
2020 vst1q_s8(scratch_data + start_width + copy_done, work_reg);
2021 }
2022
2023 if (copy_done + 8 <= copy_size) {
2024 half_work_reg =
2025 util_vld1_x8(input_block_data + input_block_offset + copy_done);
2026 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2027 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2028 }
2029 TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2030 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2031 copy_done += 8;
2032 }
2033
2034 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2035 // Total amount
2036 // = copy_size - copy_done + 4 - adjusted_residual_width
2037 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2038 // Undone micro blocks
2039 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2040
2041 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2042 if (copy_done < copy_size_adjusted) {
2043 // Employ overlapping-load strategy in order to load full register,
2044 // but use only part.
2045 // This has the advantage of resulting in zeros after shifting.
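// For example, with copy_remaining == 3 the 8-byte load ends at the last
// input byte of the row: lanes 0..4 repeat bytes already copied, lanes 5..7
// hold the 3 remaining bytes, the right shift by 40 bits moves them to lanes
// 0..2, and the vbsl then replaces lanes 3..7 with the padding value.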
2046 half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2047 copy_size - 8);
2048
2049 half_work_reg = vreinterpret_s8_s64(
2050 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2051 vdup_n_s64(-8 * (8 - copy_remaining))));
2052 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2053 vget_low_s8(padding_reg), half_work_reg);
2054
2055 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2056 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2057 }
2058 TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2059 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2060 }
2061
2062 // Trailing guard.
2063 vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2064 vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
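// (The duplicated guard stores presumably only keep the tail of this
// workspace row, including the kWorkspaceExtension bytes, initialized for
// later full-register reads; the stored values should not affect the output.)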
2065
2066 scratch_data_offset += workspace_height_stride;
2067 input_block_offset += input_height_stride;
2068 }
2069 } else if (copy_size >= 4) {
2070 const int copy_remaining = (copy_size + start_width) & 0x3;
2071 padding_mask = vreinterpret_s8_s64(vshl_s64(
2072 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2073
2074 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2075 // Work through one slice, by row, at a time.
2076 int8* scratch_data = scratch_data_base + scratch_data_offset;
2077
2078 int copy_done = 0;
2079
2080 // The surrounding condition ensures that we always need at least one
2081 // iteration of the main copy loop. In the case of leading width
2082 // padding, we unroll this specially.
2083 if (leading_width_padding) {
2084 half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
2085 half_work_reg, 0);
2086 half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
2087 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2088 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2089 }
2090 vst1_lane_8x4(scratch_data, half_work_reg, 0);
2091 copy_done += 3;
2092 }
2093
2094 // Main copy loop.
2095 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2096 half_work_reg =
2097 vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2098 half_work_reg, 0);
2099 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2100 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2101 }
2102 TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2103 vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
2104 0);
2105 }
2106
2107 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2108 // Total amount
2109 // = copy_size - copy_done + 4 - adjusted_residual_width
2110 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2111 // Undone micro blocks
2112 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2113
2114 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2115 if (copy_done < copy_size_adjusted) {
2116 TFLITE_DCHECK_LT(copy_remaining, 4);
2117 // Employ overlapping-load strategy in order to load full register,
2118 // but use only part.
2119 // This has the advantage of resulting in zeros after shifting.
2120 half_work_reg = vld1_lane_8x4(
2121 input_block_data + input_block_offset + copy_size - 4,
2122 half_work_reg, 0);
2123
2124 half_work_reg = vreinterpret_s8_s64(
2125 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2126 vdup_n_s64(-8 * (4 - copy_remaining))));
2127 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2128 vget_low_s8(padding_reg), half_work_reg);
2129
2130 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2131 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2132 }
2133 TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2134 vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
2135 0);
2136 copy_done += 4;
2137 }
2138 // Trailing guard.
2139 vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
2140 vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
2141 0);
2142 vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
2143 0);
2144 vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
2145 half_work_reg, 0);
2146
2147 scratch_data_offset += workspace_height_stride;
2148 input_block_offset += input_height_stride;
2149 }
2150 } else if (width_overall_micro_repeats == 2) {
2151 // Special case of 1 + 3 + 1, padding + copy + padding.
2152 // This is rarely executed in practice.
2153 TFLITE_DCHECK_EQ(copy_size, 3);
2154 TFLITE_DCHECK_EQ(start_width, 1);
2155 TFLITE_DCHECK(leading_width_padding);
2156 TFLITE_DCHECK(trailing_width_padding);
2157
2158 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2159 half_work_reg = vdup_n_u8(-input_offset);
2160 half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
2161 input_block_data + input_block_offset),
2162 half_work_reg, 1);
2163 half_work_reg =
2164 vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2165 input_block_offset + 1),
2166 half_work_reg, 2);
2167 half_work_reg =
2168 vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2169 input_block_offset + 2),
2170 half_work_reg, 3);
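// Lane 0 keeps the padding value for the leading padded column, lanes 1..3
// now hold the three input bytes, lane 4 supplies the trailing padded column,
// and lanes 5..7 spill into the guard region.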
2171
2172 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2173 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2174 }
2175 TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
2176 vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
2177
2178 // Trailing guard.
2179 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2180 half_work_reg, 0);
2181 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2182 half_work_reg, 0);
2183 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2184 half_work_reg, 0);
2185 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2186 half_work_reg, 0);
2187
2188 scratch_data_offset += workspace_height_stride;
2189 input_block_offset += input_height_stride;
2190 }
2191 } else {
2192 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2193 const int copy_remaining = (copy_size + start_width) & 0x3;
2194 padding_mask = vreinterpret_s8_s64(vshl_s64(
2195 vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2196 if (leading_width_padding) {
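// With leading width padding, lane 0 is the padded column, so mark it in the
// mask as well; the vbsl below then writes the padding value there.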
2197 padding_mask = vset_lane_u8(255, padding_mask, 0);
2198 }
2199
2200 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2201 for (int i = 0; i < copy_size; ++i) {
2202 half_work_reg = vreinterpret_s8_s64(
2203 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2204 half_work_reg = vld1_lane_s8(
2205 reinterpret_cast<const int8*>(
2206 input_block_data + input_block_offset + copy_size - 1 - i),
2207 half_work_reg, 0);
2208 }
2209 if (leading_width_padding) {
2210 half_work_reg = vreinterpret_s8_s64(
2211 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2212 }
2213 half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2214 vget_low_s8(padding_reg), half_work_reg);
2215
2216 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2217 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2218 }
2219 TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2220 vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2221 0);
2222
2223 // Trailing guard.
2224 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2225 half_work_reg, 0);
2226 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2227 half_work_reg, 0);
2228 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2229 half_work_reg, 0);
2230 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2231 half_work_reg, 0);
2232
2233 scratch_data_offset += workspace_height_stride;
2234 input_block_offset += input_height_stride;
2235 }
2236 }
2237
2238 scratch_data_base += copy_block_height * workspace_height_stride;
2239
2240 if (trailing_height_padding) {
2241 memset(scratch_data_base, -input_offset_difference,
2242 workspace_height_stride + kWorkspaceExtension);
2243 scratch_data_base += workspace_height_stride;
2244 }
2245
2246 TFLITE_DCHECK_EQ(
2247 scratch_data_base,
2248 scratch_block_data + block_height * workspace_height_stride);
2249 }
2250
2251 static inline void Run(
2252 int32 height_block_number, int32 width_block_number,
2253 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2254 input_block_data,
2255 int8* scratch_block_data,
2256 const DepthwiseConvDotProdParams* function_params) {
2257 #ifdef __aarch64__
2258 PreloadInputBlock(input_block_data, function_params);
2259 #endif
2260
2261 PackMacroBlockIntrinsics(height_block_number, width_block_number,
2262 input_block_data, scratch_block_data,
2263 function_params);
2264 }
2265 };
2266
2267 template <QuantizationType quantization_type>
2268 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
2269 quantization_type,
2270 DepthwiseConvDepthMultiplication::kUnitInputDepth,
2271 /*max_padding=*/0> {
2272 static inline void PackMacroBlockIntrinsics(
2273 int32 height_block_number, int32 width_block_number,
2274 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2275 input_block_data,
2276 int8* scratch_block_data,
2277 const DepthwiseConvDotProdParams* function_params) {
2278 const int workspace_height_stride =
2279 function_params->workspace_height_stride;
2280 const int width_overall_micro_repeats =
2281 function_params->input_width_overall_micro_repeats;
2282 const int input_width_micro_repeats =
2283 function_params->input_width_micro_repeats;
2284 const int block_height = function_params->inbound_block_height;
2285 const int residual_width = function_params->residual_width;
2286 const int input_height_stride = function_params->input_height_stride;
2287
2288 TFLITE_DCHECK_EQ(function_params->padding_left, 0);
2289 TFLITE_DCHECK_EQ(function_params->padding_right, 0);
2290 TFLITE_DCHECK_EQ(function_params->padding_top, 0);
2291 TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
2292
2293 TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
2294
2295 // Work through one slice, by row, at a time.
2296 int8* scratch_data_base = scratch_block_data;
2297
2298 const int copy_block_height = block_height;
2299
2300 int adjusted_residual_width =
2301 input_width_micro_repeats < width_overall_micro_repeats ? residual_width
2302 : 4;
2303
2304 const int copy_size =
2305 (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
2306
2307 TFLITE_DCHECK_LE(
2308 copy_size,
2309 input_height_stride - width_block_number * input_width_micro_repeats);
2310 // We may drop up to stride-1 of trailing input.
2311 TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
2312
2313 int scratch_data_offset = 0;
2314 int input_block_offset = 0;
2315
2316 constexpr uint8 kSignBit =
2317 QuantizationTypeImpl<quantization_type>::kUint8SignBit;
2318
2319 // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
2320 // code. Note the blocks of 4x4 are still interleaved down the depth.
2321 int8x16_t work_reg;
2322 int8x8_t half_work_reg;
2323
2324 // Effect subtraction of zero-point = 128 by XOR of sign bit.
2325 const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
2326 half_work_reg = vdup_n_s8(0);
2327
2328 if (copy_size >= 16) {
2329 const int copy_remaining = copy_size & 0x7;
2330
2331 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2332 // Work through one slice, by row, at a time.
2333 int8* scratch_data = scratch_data_base + scratch_data_offset;
2334
2335 int copy_done = 0;
2336
2337 // Main copy loop.
2338 for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2339 work_reg =
2340 util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2341 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2342 work_reg = veorq_s8(work_reg, sign_bit);
2343 }
2344 TFLITE_DCHECK_EQ(copy_done % 16, 0);
2345 vst1q_s8(scratch_data + copy_done, work_reg);
2346 }
2347
2348 if (copy_done + 8 <= copy_size) {
2349 half_work_reg =
2350 util_vld1_x8(input_block_data + input_block_offset + copy_done);
2351 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2352 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2353 }
2354 TFLITE_DCHECK_EQ(copy_done % 8, 0);
2355 vst1_s8(scratch_data + copy_done, half_work_reg);
2356 copy_done += 8;
2357 }
2358
2359 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2360 // Total amount
2361 // = copy_size - copy_done + 4 - adjusted_residual_width
2362 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2363 // Undone micro blocks
2364 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2365
2366 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2367 if (copy_done < copy_size) {
2368 // Employ overlapping-load strategy in order to load full register,
2369 // but use only part.
2370 // This has the advantage of resulting in zeros after shifting.
2371 half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2372 copy_size - 8);
2373
2374 half_work_reg = vreinterpret_s8_s64(
2375 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2376 vdup_n_s64(-8 * (8 - copy_remaining))));
2377
2378 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2379 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2380 }
2381 TFLITE_DCHECK_EQ(copy_done % 8, 0);
2382 vst1_s8(scratch_data + copy_done, half_work_reg);
2383 copy_done += 8;
2384 }
2385
2386 // Trailing guard.
2387 vst1_s8(scratch_data + copy_done, half_work_reg);
2388 vst1_s8(scratch_data + copy_done + 8, half_work_reg);
2389
2390 scratch_data_offset += workspace_height_stride;
2391 input_block_offset += input_height_stride;
2392 }
2393 } else if (copy_size >= 4) {
2394 const int copy_remaining = copy_size & 0x3;
2395
2396 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2397 // Work through one slice, by row, at a time.
2398 int8* scratch_data = scratch_data_base + scratch_data_offset;
2399
2400 int copy_done = 0;
2401
2402 // Main copy loop.
2403 for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2404 half_work_reg =
2405 vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2406 half_work_reg, 0);
2407 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2408 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2409 }
2410 TFLITE_DCHECK_EQ(copy_done % 4, 0);
2411 vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2412 }
2413
2414 TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2415 // Total amount
2416 // = copy_size - copy_done + 4 - adjusted_residual_width
2417 // = width_overall_micro_repeats * 4 - start_width - copy_done.
2418 // Undone micro blocks
2419 // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2420
2421 // Conditional is (copy_remaining > 0 || trailing_width_padding).
2422 if (copy_done < copy_size) {
2423 TFLITE_DCHECK_LT(copy_remaining, 4);
2424 // Employ overlapping-load strategy in order to load full register,
2425 // but use only part.
2426 // This has the advantage of resulting in zeros after shifting.
2427 half_work_reg = vld1_lane_8x4(
2428 input_block_data + input_block_offset + copy_size - 4,
2429 half_work_reg, 0);
2430
2431 half_work_reg = vreinterpret_s8_s64(
2432 vshl_s64(vreinterpret_s64_s8(half_work_reg),
2433 vdup_n_s64(-8 * (4 - copy_remaining))));
2434
2435 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2436 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2437 }
2438 TFLITE_DCHECK_EQ(copy_done % 4, 0);
2439 vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2440 copy_done += 4;
2441 }
2442 // Trailing guard.
2443 vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2444 vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
2445 vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
2446 vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
2447
2448 scratch_data_offset += workspace_height_stride;
2449 input_block_offset += input_height_stride;
2450 }
2451 } else {
2452 TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2453
2454 for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2455 for (int i = 0; i < copy_size; ++i) {
2456 half_work_reg = vreinterpret_s8_s64(
2457 vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2458 half_work_reg = vld1_lane_s8(
2459 reinterpret_cast<const int8*>(
2460 input_block_data + input_block_offset + copy_size - 1 - i),
2461 half_work_reg, 0);
2462 }
2463
2464 half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2465 TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2466 vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2467 0);
2468
2469 // Trailing guard.
2470 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2471 half_work_reg, 0);
2472 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2473 half_work_reg, 0);
2474 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2475 half_work_reg, 0);
2476 vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2477 half_work_reg, 0);
2478
2479 scratch_data_offset += workspace_height_stride;
2480 input_block_offset += input_height_stride;
2481 }
2482 }
2483
2484 scratch_data_base += copy_block_height * workspace_height_stride;
2485
2486 TFLITE_DCHECK_EQ(
2487 scratch_data_base,
2488 scratch_block_data + block_height * workspace_height_stride);
2489 }
2490
2491 static inline void Run(
2492 int32 height_block_number, int32 width_block_number,
2493 const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2494 input_block_data,
2495 int8* scratch_block_data,
2496 const DepthwiseConvDotProdParams* function_params) {
2497 #ifdef __aarch64__
2498 PreloadInputBlock(input_block_data, function_params);
2499 #endif
2500
2501 PackMacroBlockIntrinsics(height_block_number, width_block_number,
2502 input_block_data, scratch_block_data,
2503 function_params);
2504 }
2505 };
2506
2507 #endif // ARM NEON
2508
2509 // Apply filter to macro block of input data and store results.
2510 //
2511 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2512 template <int32 stride, QuantizationType quantization_type>
2513 struct KernelMacroBlock<
2514 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2515 DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2516 // Construct a width-shifted combination of two input sub-blocks, effectively
2517 // concatenating them.
2518 //
2519 // The filter is applied using sub-blocks. These are in the needed form for
2520 // the first (width) offset. For subsequent offsets, the filter is applied to
2521 // shifted and combined data. The concatenation and shifting here are fairly
2522 // straightforward, but in the optimized code this is an area of creativity in
2523 // design because NEON instructions do not directly support the required
2524 // between-register permutation.
2525 //
2526 // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2527 // move along the width for each output point calculation, data is shifted, in
2528 // essence between two such blocks.
2529 //
2530 // selected_data has format height 3, depth 4, width 4.
2531 //
2532 // When the micro block is trailing (the last across the macro-block width),
2533 // it would be illegal to load the right (next) block, and the no_right_block
2534 // flag indicates this scenario.
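// A minimal scalar sketch of the selection for one row and one depth slot
// (using hypothetical 4-byte arrays `left`, `right` and `selected`), which is
// equivalent to the memcpy pair in the loop below:
//
//   for (int x = 0; x < 4; ++x) {
//     selected[x] = (x + offset < 4) ? left[x + offset] : right[x + offset - 4];
//   }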
2535 static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
2536 int workspace_height_stride,
2537 int width_micro_stride,
2538 bool no_right_block,
2539 const int8* input_block,
2540 int8 selected_data[3][4][4]) {
2541 TFLITE_DCHECK_GE(offset, 0);
2542 TFLITE_DCHECK_LT(offset, 4);
2543
2544 // The input banks have same format as selected_data.
2545 int8 left_bank[3][4][4];
2546 int8 right_bank[3][4][4];
2547
2548 // Work through one slice, by row, at a time.
2549 for (int k_height = 0; k_height < 3; ++k_height) {
2550 // Simulate demangling of mangled storage arrangement.
2551 const int8* left_input_block =
2552 &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
2553 memcpy(left_bank[k_height][0], left_input_block, 16);
2554 if (no_right_block) {
2555 memset(right_bank[k_height][0], 0, 16);
2556 } else {
2557 const int8* right_input_block =
2558 &input_block[k_height * workspace_height_stride +
2559 sub_block * 2 * 8 + width_micro_stride];
2560 memcpy(right_bank[k_height][0], right_input_block, 16);
2561 }
2562 for (int depth_index = 0; depth_index < 4; ++depth_index) {
2563 memcpy(selected_data[k_height][depth_index],
2564 &left_bank[k_height][depth_index][offset], 4 - offset);
2565 memcpy(&selected_data[k_height][depth_index][4 - offset],
2566 right_bank[k_height][depth_index], offset);
2567 }
2568 }
2569 }
2570
2571 // Straight implementation of 3x3 filter within sub-micro block.
2572 static inline void Calculate3x3FilterOutput(
2573 const DepthwiseConvDotProdParams& params, int sub_block,
2574 const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
2575 const int32* bias_data, uint8 output_values[4]) {
2576 const int32 output_activation_min = params.quantized_activation_min;
2577 const int32 output_activation_max = params.quantized_activation_max;
2578 const int32 output_multiplier = params.output_multiplier;
2579 const int32 output_shift = params.output_shift;
2580 const int32 output_offset = params.output_offset;
2581 for (int d = 0; d < 4; ++d) {
2582 int32 acc = 0;
2583 for (int y = 0; y < 3; ++y) {
2584 for (int x = 0; x < 4; ++x) {
2585 int32 input_val = selected_data[y][d][x];
2586 int32 filter_val = filter_bank[y][sub_block][d][x];
2587 acc += filter_val * input_val;
2588 }
2589 }
2590 acc += bias_data[d];
2591 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2592 DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2593 output_shift);
2594 acc += output_offset;
2595 acc = std::max(acc, output_activation_min);
2596 acc = std::min(acc, output_activation_max);
2597 output_values[d] = static_cast<uint8>(acc);
2598 }
2599 }
2600
2601 static inline void Run(const int8* scratch_block_data,
2602 const int8* filter_workspace, const int32* bias_data,
2603 uint8* output_block_data,
2604 const DepthwiseConvDotProdParams* function_params) {
2605 const int workspace_height_stride =
2606 function_params->workspace_height_stride;
2607 const int input_width_overall_micro_repeats =
2608 function_params->input_width_overall_micro_repeats;
2609 const int output_width_micro_repeats =
2610 function_params->output_width_micro_repeats;
2611 const int depth_micro_repeats = function_params->depth_micro_repeats;
2612 const int depth = function_params->input_depth;
2613 const int stride_val = function_params->stride;
2614 const int four_over_stride = function_params->four_over_stride;
2615
2616 const int output_width_overall_micro_repeats =
2617 function_params->output_width_overall_micro_repeats;
2618 const int block_height = function_params->outbound_block_height;
2619 const int residual_width = function_params->output_residual_width;
2620 const int output_height_stride = function_params->output_height_stride;
2621 constexpr int bias_increment = 4;
2622 TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2623
2624 TFLITE_DCHECK(depth_micro_repeats > 0);
2625 const int width_micro_stride = 4 * 8;
2626 const int depth_micro_stride =
2627 width_micro_stride * input_width_overall_micro_repeats;
2628
2629 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
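// = 96 bytes: 3 filter rows x 2 sub-blocks x (4 depths x 4 width taps), the
// size of filter_bank below.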
2630
2631 // Simulate NEON-register transposition of subset of filter.
2632 int8 filter_bank[3][2][4][4]; // Height 3, sub-block, depth 4, width 4.
2633 // Simulate NEON-register input data concatenation + sub-selection.
2634 int8 sub_selected_input_data[3][4][4]; // Height 3, depth 4, width 4.
2635 uint8 output_values[4]; // Depth 4.
2636
2637 // The outer 3 loops go through all the micro blocks in a macro block, and
2638 // separately treat the two sub-blocks within each micro block.
2639 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2640 memcpy(filter_bank[0][0][0],
2641 filter_workspace + j_depth * shuffled_filter_increment,
2642 shuffled_filter_increment);
2643
2644 for (int s = 0; s < 2; ++s) {
2645 for (int k_height = 0; k_height < block_height; ++k_height) {
2646 const int8* scratch_data =
2647 scratch_block_data +
2648 workspace_height_stride * k_height * stride_val +
2649 depth_micro_stride * j_depth;
2650 uint8* output_data =
2651 output_block_data + output_height_stride * k_height + 8 * j_depth;
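// Each depth micro block supplies 8 consecutive output channels; sub-block s
// (below) writes channels 4 * s .. 4 * s + 3 of those.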
2652
2653 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2654 ++i_width) {
2655 const int output_width = i_width == output_width_micro_repeats
2656 ? residual_width
2657 : four_over_stride;
2658 const bool no_right_block = (output_width - 1) * stride_val < 2;
2659 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2660 const int8* input_data =
2661 scratch_data + width_micro_stride * i_width;
2662 // Iterate over input width shifts within sub-micro blocks.
2663 for (int x = 0; x < output_width; ++x) {
2664 ConcatenateInputSubBlocks(x * stride_val, s,
2665 workspace_height_stride,
2666 width_micro_stride, no_right_block,
2667 input_data, sub_selected_input_data);
2668 Calculate3x3FilterOutput(
2669 *function_params, s, sub_selected_input_data, filter_bank,
2670 bias_data + (2 * j_depth + s) * bias_increment,
2671 output_values);
2672 for (int d = 0; d < 4; ++d) {
2673 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2674 d] = output_values[d];
2675 }
2676 }
2677 }
2678 }
2679 }
2680 }
2681 }
2682 };
2683
2684 // Apply filter to macro block of input data and store results.
2685 //
2686 // Parameters for repeats and residual sizes are in terms of outputs.
2687 //
2688 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2689 template <int32 stride, QuantizationType quantization_type>
2690 struct KernelMacroBlock<
2691 DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2692 DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
2693 // Construct a width-shifted combination of two input sub-blocks, effectively
2694 // concatenating them.
2695 //
2696 // The filter is applied using sub-blocks. These are in the needed form for
2697 // the first (width) offset. For subsequent offsets, the filter is applied to
2698 // shifted and combined data. The concatenation and shifting here are fairly
2699 // straightforward, but in the optimized code this is an area of creativity in
2700 // design because NEON instructions do not directly support the required
2701 // between-register permutation.
2702 //
2703 // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2704 // move along the width for each output point calculation, data is shifted, in
2705 // essence between two such blocks.
2706 //
2707 // selected_data has format height 3, width 4.
2708 //
2709 // When the micro block is trailing (the last across the macro-block width),
2710 // it would be illegal to load the right (next) block, and the no_right_block
2711 // flag indicates this scenario.
2712 static inline void ConcatenateInputSubBlocks(int offset,
2713 int workspace_height_stride,
2714 bool no_right_block,
2715 const int8* input_block,
2716 int8 selected_data[3][4]) {
2717 TFLITE_DCHECK_GE(offset, 0);
2718 TFLITE_DCHECK_LT(offset, 4);
2719 if (no_right_block) {
2720 for (int k_height = 0; k_height < 3; ++k_height) {
2721 memcpy(selected_data[k_height],
2722 &input_block[k_height * workspace_height_stride + offset],
2723 4 - offset);
2724 }
2725 } else {
2726 for (int k_height = 0; k_height < 3; ++k_height) {
2727 memcpy(selected_data[k_height],
2728 &input_block[k_height * workspace_height_stride + offset], 4);
2729 }
2730 }
2731 }
2732
2733 // Straight implementation of 3x3 filter within sub-micro block.
2734 static inline void Calculate3x3FilterOutput(
2735 const DepthwiseConvDotProdParams& function_params, int sub_block,
2736 const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
2737 const int32* bias_data, uint8 output_values[4]) {
2738 const int32 output_activation_min =
2739 function_params.quantized_activation_min;
2740 const int32 output_activation_max =
2741 function_params.quantized_activation_max;
2742 const int32 output_multiplier = function_params.output_multiplier;
2743 const int32 output_shift = function_params.output_shift;
2744 const int32 output_offset = function_params.output_offset;
2745 for (int d = 0; d < 4; ++d) {
2746 int32 acc = 0;
2747 for (int y = 0; y < 3; ++y) {
2748 for (int x = 0; x < 4; ++x) {
2749 int32 input_val = selected_data[y][x];
2750 int32 filter_val = filter_bank[y][sub_block][d][x];
2751 acc += filter_val * input_val;
2752 }
2753 }
2754 acc += bias_data[d];
2755 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2756 DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2757 output_shift);
2758 acc += output_offset;
2759 acc = std::max(acc, output_activation_min);
2760 acc = std::min(acc, output_activation_max);
2761 output_values[d] = static_cast<uint8>(acc);
2762 }
2763 }
2764
2765 static inline void Run(const int8* scratch_block_data,
2766 const int8* filter_workspace, const int32* bias_data,
2767 uint8* output_block_data,
2768 const DepthwiseConvDotProdParams* function_params) {
2769 const int workspace_height_stride =
2770 function_params->workspace_height_stride;
2771 const int output_width_micro_repeats =
2772 function_params->output_width_micro_repeats;
2773 const int depth_micro_repeats = function_params->depth_micro_repeats;
2774 const int depth = function_params->output_depth;
2775 const int stride_val = function_params->stride;
2776 const int four_over_stride = function_params->four_over_stride;
2777
2778 const int workspace_width_micro_repeats =
2779 function_params->workspace_width_micro_repeats;
2780 const int output_width_overall_micro_repeats =
2781 function_params->output_width_overall_micro_repeats;
2782 const int block_height = function_params->outbound_block_height;
2783 const int residual_width = function_params->output_residual_width;
2784 const int output_height_stride = function_params->output_height_stride;
2785 constexpr int bias_increment = 4;
2786 TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2787
2788 TFLITE_DCHECK(depth_micro_repeats > 0);
2789
2790 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2791
2792 // Simulate NEON-register transposition of subset of filter.
2793 int8 filter_bank[3][2][4][4]; // Height 3, sub-block, depth 4, width 4.
2794 // Simulate NEON-register input data concatenation + sub-selection.
2795 int8 sub_selected_input_data[3][4]; // Height 3, width 4.
2796 uint8 output_values[4]; // Depth 4.
2797
2798 // The outer 3 loops go through all the micro blocks in a macro block, and
2799 // separately treat the two sub-blocks within each micro block.
2800 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2801 memcpy(filter_bank[0][0][0],
2802 filter_workspace + j_depth * shuffled_filter_increment,
2803 shuffled_filter_increment);
2804
2805 for (int s = 0; s < 2; ++s) {
2806 for (int k_height = 0; k_height < block_height; ++k_height) {
2807 const int8* scratch_data =
2808 scratch_block_data +
2809 workspace_height_stride * k_height * stride_val;
2810 uint8* output_data =
2811 output_block_data + output_height_stride * k_height + 8 * j_depth;
2812
2813 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2814 ++i_width) {
2815 const int output_width = i_width == output_width_micro_repeats
2816 ? residual_width
2817 : four_over_stride;
2818 const bool no_right_block = i_width == output_width_micro_repeats &&
2819 output_width_overall_micro_repeats ==
2820 workspace_width_micro_repeats;
2821 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2822 const int8* input_data = scratch_data + 4 * i_width;
2823 // Iterate over input width shifts within 4x4 blocks.
2824 for (int x = 0; x < output_width; ++x) {
2825 ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
2826 no_right_block, input_data,
2827 sub_selected_input_data);
2828 Calculate3x3FilterOutput(
2829 *function_params, s, sub_selected_input_data, filter_bank,
2830 bias_data + (2 * j_depth + s) * bias_increment,
2831 output_values);
2832 for (int d = 0; d < 4; ++d) {
2833 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2834 d] = output_values[d];
2835 }
2836 }
2837 }
2838 }
2839 }
2840 }
2841 }
2842 };
2843
2844 // Beginning of code section containing intermediate code transformation.
2845 //
2846 // This section is only compiled when kUseUnwound3x3DotProduct versions of
2847 // templated functions are selected.
2848 template <int32 stride, QuantizationType quantization_type>
2849 struct KernelMacroBlock<
2850 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
2851 DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2852 static inline void Run(const int8* scratch_block_data,
2853 const int8* filter_workspace, const int32* bias_data,
2854 uint8* output_block_data,
2855 const DepthwiseConvDotProdParams* function_params) {
2856 const int workspace_height_stride =
2857 function_params->workspace_height_stride;
2858 const int input_width_overall_micro_repeats =
2859 function_params->input_width_overall_micro_repeats;
2860 const int output_width_micro_repeats =
2861 function_params->output_width_micro_repeats;
2862 const int depth_micro_repeats = function_params->depth_micro_repeats;
2863 const int depth = function_params->input_depth;
2864 const int stride_val = function_params->stride;
2865 const int four_over_stride = function_params->four_over_stride;
2866
2867 const int output_width_overall_micro_repeats =
2868 function_params->output_width_overall_micro_repeats;
2869 const int block_height = function_params->outbound_block_height;
2870 const int residual_width = function_params->output_residual_width;
2871 const int output_height_stride = function_params->output_height_stride;
2872 const int bias_increment = function_params->bias_increment;
2873
2874 TFLITE_DCHECK(depth_micro_repeats > 0);
2875 const int width_micro_stride = 4 * 8;
2876 const int depth_micro_stride =
2877 width_micro_stride * input_width_overall_micro_repeats;
2878
2879 const int32 output_activation_min =
2880 function_params->quantized_activation_min;
2881 const int32 output_activation_max =
2882 function_params->quantized_activation_max;
2883 const int32 output_multiplier = function_params->output_multiplier;
2884 const int32 output_shift = function_params->output_shift;
2885 const int32 output_offset = function_params->output_offset;
2886
2887 // Simulate NEON-register transposition of subset of filter.
2888 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
2889 int8 filter_bank_a_1[4][4];
2890 int8 filter_bank_a_2[4][4];
2891 int8 filter_bank_b_0[4][4];
2892 int8 filter_bank_b_1[4][4];
2893 int8 filter_bank_b_2[4][4];
    // Simulate NEON-register input data concatenation + sub-selection.
    uint8 output_values[4];  // Depth 4, for one sub-block at a time.
    // Each left/right bank below holds selected input data for one of the
    // three filter rows, in format depth 4, width 4.
2898 int8 left_bank_0[4][4];
2899 int8 left_bank_1[4][4];
2900 int8 left_bank_2[4][4];
2901 int8 right_bank_0[4][4];
2902 int8 right_bank_1[4][4];
2903 int8 right_bank_2[4][4];
2904 memset(right_bank_0[0], 0, 16);
2905 memset(right_bank_1[0], 0, 16);
2906 memset(right_bank_2[0], 0, 16);
2907
2908 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2909
2910 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2911 const int8* filter_block =
2912 filter_workspace + shuffled_filter_increment * j_depth;
2913
2914 memcpy(filter_bank_a_0, filter_block, 16);
2915 memcpy(filter_bank_b_0, filter_block + 16, 16);
2916 memcpy(filter_bank_a_1, filter_block + 32, 16);
2917 memcpy(filter_bank_b_1, filter_block + 48, 16);
2918 memcpy(filter_bank_a_2, filter_block + 64, 16);
2919 memcpy(filter_bank_b_2, filter_block + 80, 16);
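      // The shuffled filter block interleaves the two sub-blocks row by row:
      // a_0, b_0, a_1, b_1, a_2, b_2, each 16 bytes (depth 4 x width 4).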
2920
2921 for (int s = 0; s < 2; ++s) {
2922 // Work through one slice, by row, at a time.
2923 for (int k_height = 0; k_height < block_height; ++k_height) {
2924 const int8* scratch_data =
2925 scratch_block_data +
2926 workspace_height_stride * k_height * stride_val +
2927 depth_micro_stride * j_depth;
2928 uint8* output_data =
2929 output_block_data + output_height_stride * k_height + 8 * j_depth;
2930 const int8* input_data_0 = scratch_data + s * 2 * 8;
2931
2932 // Load first sub-micro block of data into operational banks.
2933 memcpy(left_bank_0[0], input_data_0, 16);
2934 memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
2935 memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
2936 16);
2937
2938 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2939 ++i_width) {
2940 const int output_width = i_width == output_width_micro_repeats
2941 ? residual_width
2942 : four_over_stride;
2943 TFLITE_DCHECK_LE(output_width * stride_val, 4);
2944 const int8* input_data =
2945 input_data_0 + width_micro_stride * i_width;
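            // (output_width - 1) * stride_val is the largest width shift used
            // below. Because each 3-tap filter row is padded to width 4 with a
            // zero, shifts of 0 and 1 read only meaningful data from the left
            // banks; the right banks are needed only for shifts of 2 or more.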
2946 const bool no_right_block = (output_width - 1) * stride_val < 2;
2947
2948 // Load next sub-micro block of data.
2949 if (!no_right_block) {
2950 memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
2951 memcpy(right_bank_1[0],
2952 input_data + workspace_height_stride + width_micro_stride,
2953 16);
2954 memcpy(
2955 right_bank_2[0],
2956 input_data + 2 * workspace_height_stride + width_micro_stride,
2957 16);
2958 }
2959
2960 // Iterate over input width shifts within 4x4 blocks.
2961 for (int x = 0; x < output_width; ++x) {
2962 // Operate on depth of 4 in batches.
2963 for (int d = 0; d < 4; ++d) {
2964 int32 acc = 0;
                for (int i = 0; i < 4; ++i) {
                  int32 input_val = left_bank_0[d][i];
                  int32 filter_val = filter_bank_a_0[d][i];
                  acc += filter_val * input_val;
                }
                for (int i = 0; i < 4; ++i) {
                  int32 input_val = left_bank_1[d][i];
                  int32 filter_val = filter_bank_a_1[d][i];
                  acc += filter_val * input_val;
                }
                for (int i = 0; i < 4; ++i) {
                  int32 input_val = left_bank_2[d][i];
                  int32 filter_val = filter_bank_a_2[d][i];
                  acc += filter_val * input_val;
                }
2980 acc += bias_data[d];
2981 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2982 DepthwiseConvOutputRounding::kUpward>(
2983 acc, output_multiplier, output_shift);
2984 acc += output_offset;
2985 acc = std::max(acc, output_activation_min);
2986 acc = std::min(acc, output_activation_max);
2987 output_values[d] = static_cast<uint8>(acc);
2988 }
2989
2990 for (int d = 0; d < 4; ++d) {
2991 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2992 d] = output_values[d];
2993 }
2994
2995 // Simulate shifting instructions.
2996 if (stride_val == 1) {
2997 for (int depth_index = 0; depth_index < 4; ++depth_index) {
2998 for (int z = 0; z < 3; ++z) {
2999 left_bank_0[depth_index][z] =
3000 left_bank_0[depth_index][z + 1];
3001 left_bank_1[depth_index][z] =
3002 left_bank_1[depth_index][z + 1];
3003 left_bank_2[depth_index][z] =
3004 left_bank_2[depth_index][z + 1];
3005 }
3006 left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
3007 left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
3008 left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
3009 for (int z = 0; z < 3; ++z) {
3010 right_bank_0[depth_index][z] =
3011 right_bank_0[depth_index][z + 1];
3012 right_bank_1[depth_index][z] =
3013 right_bank_1[depth_index][z + 1];
3014 right_bank_2[depth_index][z] =
3015 right_bank_2[depth_index][z + 1];
3016 }
3017 }
3018 } else {
3019 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3020 for (int z = 0; z < 2; ++z) {
3021 left_bank_0[depth_index][z] =
3022 left_bank_0[depth_index][z + 2];
3023 left_bank_1[depth_index][z] =
3024 left_bank_1[depth_index][z + 2];
3025 left_bank_2[depth_index][z] =
3026 left_bank_2[depth_index][z + 2];
3027 }
3028 left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
3029 left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
3030 left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
3031 left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
3032 left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
3033 left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
3034 for (int z = 0; z < 2; ++z) {
3035 right_bank_0[depth_index][z] =
3036 right_bank_0[depth_index][z + 2];
3037 right_bank_1[depth_index][z] =
3038 right_bank_1[depth_index][z + 2];
3039 right_bank_2[depth_index][z] =
3040 right_bank_2[depth_index][z + 2];
3041 }
3042 }
3043 }
3044 }
3045 }
3046 }
3047 bias_data += bias_increment;
3048
3049 // Move filter for second sub-block into operational filter.
3050 for (int z = 0; z < 4; ++z) {
3051 for (int x = 0; x < 4; ++x) {
3052 filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
3053 filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
3054 filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
3055 }
3056 }
3057 }
3058 }
3059 }
3060 };
3061
3062 template <int32 stride, QuantizationType quantization_type>
3063 struct KernelMacroBlock<
3064 DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
3065 DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
3066 static inline void Run(const int8* scratch_block_data,
3067 const int8* filter_workspace, const int32* bias_data,
3068 uint8* output_block_data,
3069 const DepthwiseConvDotProdParams* function_params) {
3070 const int workspace_height_stride =
3071 function_params->workspace_height_stride;
3072 const int output_width_micro_repeats =
3073 function_params->output_width_micro_repeats;
3074 const int depth_micro_repeats = function_params->depth_micro_repeats;
3075 const int output_depth = function_params->output_depth;
3076 const int stride_val = function_params->stride;
3077 const int four_over_stride = function_params->four_over_stride;
3078
3079 const int output_width_overall_micro_repeats =
3080 function_params->output_width_overall_micro_repeats;
3081 const int block_height = function_params->outbound_block_height;
3082 const int residual_width = function_params->output_residual_width;
3083 const int output_height_stride = function_params->output_height_stride;
3084 const int bias_increment = function_params->bias_increment;
3085
3086 const int32 output_activation_min =
3087 function_params->quantized_activation_min;
3088 const int32 output_activation_max =
3089 function_params->quantized_activation_max;
3090 const int32 output_multiplier = function_params->output_multiplier;
3091 const int32 output_shift = function_params->output_shift;
3092 const int32 output_offset = function_params->output_offset;
3093
3094 TFLITE_DCHECK(depth_micro_repeats > 0);
3095
3096 TFLITE_DCHECK_EQ(bias_increment, 4);
3097
3098 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3099
3100 // Simulate NEON-register transposition of subset of filter.
3101 int8 filter_bank_a_0[4][4]; // Depth 4, width 4.
3102 int8 filter_bank_a_1[4][4];
3103 int8 filter_bank_a_2[4][4];
3104 int8 filter_bank_b_0[4][4];
3105 int8 filter_bank_b_1[4][4];
3106 int8 filter_bank_b_2[4][4];
    // Simulate NEON-register input data concatenation + sub-selection for
    // unit input depth: one 8-byte bank per filter row (height 3).
3109
3110 int8 input_bank_0[8];
3111 int8 input_bank_1[8];
3112 int8 input_bank_2[8];
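    // With unit input depth each bank holds 8 consecutive bytes of one input
    // row: the current 4-wide window in the low half and the next 4 bytes,
    // refreshed once per width micro block, in the high half.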
3113
3114 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
3115
3116 uint8 output_values[2][4]; // Sub-block, depth 4.
3117
3118 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3119 memcpy(filter_bank_a_0, filter_workspace, 16);
3120 memcpy(filter_bank_b_0, filter_workspace + 16, 16);
3121 memcpy(filter_bank_a_1, filter_workspace + 32, 16);
3122 memcpy(filter_bank_b_1, filter_workspace + 48, 16);
3123 memcpy(filter_bank_a_2, filter_workspace + 64, 16);
3124 memcpy(filter_bank_b_2, filter_workspace + 80, 16);
3125
3126 // Work through one slice, by row, at a time.
3127 for (int k_height = 0; k_height < block_height; ++k_height) {
3128 const int8* scratch_data =
3129 scratch_block_data +
3130 workspace_height_stride * k_height * stride_val;
3131 uint8* output_data =
3132 output_block_data + output_height_stride * k_height + 8 * j_depth;
3133
3134 memcpy(input_bank_0, scratch_data, 4);
3135 memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
3136 memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
3137
3138 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3139 ++i_width) {
3140 const int output_width = i_width == output_width_micro_repeats
3141 ? residual_width
3142 : four_over_stride;
3143
3144 TFLITE_DCHECK_LE(output_width * stride_val, 4);
3145 const int8* input_data = scratch_data + 4 * i_width;
3146
3147 memcpy(input_bank_0 + 4, input_data + 4, 4);
3148 memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
3149 memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
3150 4);
3151
3152 // Iterate over input width shifts within 4x4 blocks.
3153 for (int w = 0; w < output_width; ++w) {
3154 constexpr int offset =
3155 0; // Shift input instead of offset in multiply-accumulate.
3156
3157 {
3158 const int s = 0;
3159 for (int d = 0; d < 4; ++d) {
3160 int32 acc = bias_data[s * 4 + d];
3161 for (int x = 0; x < 4; ++x) {
3162 int32 input_val_0 = input_bank_0[offset + x];
3163 int32 filter_val_0 = filter_bank_a_0[d][x];
3164 acc += filter_val_0 * input_val_0;
3165 int32 input_val_1 = input_bank_1[offset + x];
3166 int32 filter_val_1 = filter_bank_a_1[d][x];
3167 acc += filter_val_1 * input_val_1;
3168 int32 input_val_2 = input_bank_2[offset + x];
3169 int32 filter_val_2 = filter_bank_a_2[d][x];
3170 acc += filter_val_2 * input_val_2;
3171 }
3172 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3173 DepthwiseConvOutputRounding::kUpward>(
3174 acc, output_multiplier, output_shift);
3175 acc += output_offset;
3176 acc = std::max(acc, output_activation_min);
3177 acc = std::min(acc, output_activation_max);
3178 output_values[s][d] = static_cast<uint8>(acc);
3179
3180 output_data[s * 4 + d] = output_values[s][d];
3181 }
3182 }
3183 {
3184 const int s = 1;
3185 for (int d = 0; d < 4; ++d) {
3186 int32 acc = bias_data[s * 4 + d];
3187 for (int x = 0; x < 4; ++x) {
3188 int32 input_val_0 = input_bank_0[offset + x];
3189 int32 filter_val_0 = filter_bank_b_0[d][x];
3190 acc += filter_val_0 * input_val_0;
3191 int32 input_val_1 = input_bank_1[offset + x];
3192 int32 filter_val_1 = filter_bank_b_1[d][x];
3193 acc += filter_val_1 * input_val_1;
3194 int32 input_val_2 = input_bank_2[offset + x];
3195 int32 filter_val_2 = filter_bank_b_2[d][x];
3196 acc += filter_val_2 * input_val_2;
3197 }
3198 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3199 DepthwiseConvOutputRounding::kUpward>(
3200 acc, output_multiplier, output_shift);
3201 acc += output_offset;
3202 acc = std::max(acc, output_activation_min);
3203 acc = std::min(acc, output_activation_max);
3204 output_values[s][d] = static_cast<uint8>(acc);
3205
3206 output_data[s * 4 + d] = output_values[s][d];
3207 }
3208 }
3209
3210 // Simulate register shifts.
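            // Sliding every bank left by stride_val bytes advances the input
            // window by one output position, standing in for the register
            // shifts used by the intrinsic and assembly versions.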
3211 for (int i = 0; i < (8 - stride_val); ++i) {
3212 input_bank_0[i] = input_bank_0[i + stride_val];
3213 input_bank_1[i] = input_bank_1[i + stride_val];
3214 input_bank_2[i] = input_bank_2[i + stride_val];
3215 }
3216
3217 output_data += output_depth;
3218 }
3219 }
3220 }
3221 bias_data += 2 * bias_increment;
3222 filter_workspace += shuffled_filter_increment;
3223 }
3224 }
3225 };
3226 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
3227 // of templated functions are selected.
3228 //
3229 // End of code section containing intermediate code transformation.
3230
3231 #ifdef USE_NEON
3232 template <>
3233 struct KernelMacroBlock<
3234 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3235 QuantizationType::kNonPerChannelUint8,
3236 DepthwiseConvDepthMultiplication::kNoMultiplication,
3237 /*stride=*/1> {
3238 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3239 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3240 return vmin_u8(a, b);
3241 }
3242 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3243 return vmax_u8(a, b);
3244 }
3245 static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
3246 return vminq_u8(a, b);
3247 }
3248 static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
3249 return vmaxq_u8(a, b);
3250 }
3251
3252 static inline void KernelMacroBlockIntrinsics(
3253 const int8* scratch_block_data, const int8* filter_workspace,
3254 const int32* bias_data, uint8* output_block_data,
3255 const DepthwiseConvDotProdParams* function_params) {
3256 static constexpr QuantizationType quantization_type =
3257 QuantizationType::kNonPerChannelUint8;
3258
3259 const int workspace_height_stride =
3260 function_params->workspace_height_stride;
3261 const int input_width_overall_micro_repeats =
3262 function_params->input_width_overall_micro_repeats;
3263 const int output_width_micro_repeats =
3264 function_params->output_width_micro_repeats;
3265 const int depth_micro_repeats = function_params->depth_micro_repeats;
3266 const int depth = function_params->input_depth;
3267
3268 const int output_width_overall_micro_repeats =
3269 function_params->output_width_overall_micro_repeats;
3270 const int block_height = function_params->outbound_block_height;
3271 const int residual_width = function_params->output_residual_width;
3272 const int output_height_stride = function_params->output_height_stride;
3273 constexpr int kBiasIncrement = 4;
3274
3275 TFLITE_DCHECK(depth_micro_repeats > 0);
3276 const int width_micro_stride = 4 * 8;
3277 const int depth_micro_stride =
3278 width_micro_stride * input_width_overall_micro_repeats;
3279
3280 const int32 output_activation_min =
3281 function_params->quantized_activation_min;
3282 const int32 output_activation_max =
3283 function_params->quantized_activation_max;
3284 const int32 output_multiplier = function_params->output_multiplier;
3285 const int32 output_shift = function_params->output_shift;
3286 const int32 output_offset = function_params->output_offset;
3287 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3288 TFLITE_DCHECK_GE(output_activation_min, 0);
3289 TFLITE_DCHECK_LT(output_activation_min, 256);
3290 TFLITE_DCHECK_GE(output_activation_max, 0);
3291 TFLITE_DCHECK_LT(output_activation_max, 256);
3292 } else {
3293 TFLITE_DCHECK_GE(output_activation_min, -128);
3294 TFLITE_DCHECK_LT(output_activation_min, 128);
3295 TFLITE_DCHECK_GE(output_activation_max, -128);
3296 TFLITE_DCHECK_LT(output_activation_max, 128);
3297 }
    TFLITE_DCHECK_GE(output_offset, -32768);
3299 TFLITE_DCHECK_LT(output_offset, 32768);
3300
3301 const int16x8_t output_offset_vec =
3302 vdupq_n_s16(static_cast<int16>(output_offset));
3303 const uint8x16_t output_activation_min_vec =
3304 vdupq_n_u8(static_cast<uint8>(output_activation_min));
3305 const uint8x16_t output_activation_max_vec =
3306 vdupq_n_u8(static_cast<uint8>(output_activation_max));
3307
3308 const int8* input_data_depthwise = scratch_block_data;
3309 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3310 output_data_depthwise = output_block_data;
3311 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3312 // Simulate NEON-register transposition of subset of filter.
3313 int8x16_t filter_reg_0_a;
3314 int8x16_t filter_reg_0_b;
3315 int8x16_t filter_reg_1_a;
3316 int8x16_t filter_reg_1_b;
3317 int8x16_t filter_reg_2_a;
3318 int8x16_t filter_reg_2_b;
3319 int8x16_t filter_reg_0_a_shifted;
3320 int8x16_t filter_reg_1_a_shifted;
3321 int8x16_t filter_reg_2_a_shifted;
3322
3323 filter_reg_0_a = vld1q_s8(filter_workspace);
3324 filter_workspace += 16;
3325 filter_reg_0_b = vld1q_s8(filter_workspace);
3326 filter_workspace += 16;
3327 filter_reg_1_a = vld1q_s8(filter_workspace);
3328 filter_workspace += 16;
3329 filter_reg_1_b = vld1q_s8(filter_workspace);
3330 filter_workspace += 16;
3331 filter_reg_2_a = vld1q_s8(filter_workspace);
3332 filter_workspace += 16;
3333 filter_reg_2_b = vld1q_s8(filter_workspace);
3334 filter_workspace += 16;
3335
3336 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
3337 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
3338 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
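      // Shifting each 32-bit lane left by 8 bits turns the per-depth taps
      // (f0, f1, f2, 0) into (0, f0, f1, f2), so dotting the shifted filters
      // against unshifted input yields the output one pixel to the right.
      // Note: passing int8x16_t values straight to vshlq_n_u32 (and to
      // vrev32q_u16 below) relies on lax vector conversions; a strict
      // toolchain would need explicit vreinterpretq casts.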
3339
3340 if (block_height == 4) {
3341 for (int s = 0; s < 2; ++s) {
3342 // Work through one slice, by row, at a time.
3343 const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
3344 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3345 output_data_base = output_data_depthwise + 4 * s;
3346
3347 const int8* next_input_data = input_data_base;
3348 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3349 output_data = output_data_base;
3350
3351 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3352 bias_data += kBiasIncrement;
3353
3354 // Load first sub-micro block of data into operational banks.
3355 int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
3356 int8x16_t left_bank_1_reg =
3357 vld1q_s8(next_input_data + workspace_height_stride);
3358 int8x16_t left_bank_2_reg =
3359 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3360 int8x16_t left_bank_3_reg =
3361 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3362 int8x16_t left_bank_4_reg =
3363 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3364 int8x16_t left_bank_5_reg =
3365 vld1q_s8(next_input_data + 5 * workspace_height_stride);
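          // Six input rows (0..5) cover the four output rows computed in this
          // pass: output row r reads input rows r, r + 1 and r + 2.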
3366
3367 int32x4_t acc0;
3368 int32x4_t acc1;
3369 int32x4_t acc2;
3370 int32x4_t acc3;
3371
3372 acc0 = adjusted_bias_data;
3373 acc1 = adjusted_bias_data;
3374 acc2 = adjusted_bias_data;
3375 acc3 = adjusted_bias_data;
3376
3377 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3378 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3379 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3380 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
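          // Software pipelining: the accumulators for the first output column
          // are partially pre-computed here (and re-seeded at the end of each
          // loop body below) so the remaining dot products can overlap with
          // the loads inside the loop.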
3381
3382 for (int i_width = 0; i_width < output_width_micro_repeats;
3383 ++i_width) {
3384 next_input_data += width_micro_stride;
3385
3386 // Iterate over input width shifts within 4x4 blocks.
3387 {
3388 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3389 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3390 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3391 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3392 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3393 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3394 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3395 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3396
3397 // Fixed-point multiplication.
3398 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3399 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3400 acc0, -output_shift);
3401 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3402 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3403 acc1, -output_shift);
3404 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3405 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3406 acc2, -output_shift);
3407 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3408 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3409 acc3, -output_shift);
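              // vqrdmulhq_n_s32 plus the rounding divide-by-POT above is the
              // vector form of DepthwiseConvRound<kUpward>: a saturating
              // rounding doubling high multiply by output_multiplier followed
              // by a rounding right shift by -output_shift.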
3410 // Add the output offset.
3411 int16x8_t acc_s16_0_1 =
3412 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3413 int16x8_t acc_s16_2_3 =
3414 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3415 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3416 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3417 // Apply the activation function.
3418 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3419 vqmovxn_s16(acc_s16_2_3));
3420 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3421 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3422
3423 vst1q_lane_8x4(output_data, acc_u8_all, 0);
3424 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3425 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3426 2);
3427 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3428 3);
3429
3430 output_data += depth;
3431 }
3432
3433 // Load next sub-micro block of data.
3434 int8x16_t right_bank_0_reg;
3435 int8x16_t right_bank_1_reg;
3436 int8x16_t right_bank_2_reg;
3437 int8x16_t right_bank_3_reg;
3438 int8x16_t right_bank_4_reg;
3439 int8x16_t right_bank_5_reg;
3440
3441 // Loading of next block always valid.
3442 right_bank_0_reg = vld1q_s8(next_input_data);
3443 right_bank_1_reg =
3444 vld1q_s8(next_input_data + workspace_height_stride);
3445 right_bank_2_reg =
3446 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3447 right_bank_3_reg =
3448 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3449 right_bank_4_reg =
3450 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3451 right_bank_5_reg =
3452 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3453
3454 {
3455 acc0 = adjusted_bias_data;
3456 acc1 = adjusted_bias_data;
3457 acc2 = adjusted_bias_data;
3458 acc3 = adjusted_bias_data;
3459
3460 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3461 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3462 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3463 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3464 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3465 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3466 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3467 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3468 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3469 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3470 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3471 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3472
3473 // Fixed-point multiplication.
3474 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3475 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3476 acc0, -output_shift);
3477 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3478 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3479 acc1, -output_shift);
3480 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3481 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3482 acc2, -output_shift);
3483 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3484 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3485 acc3, -output_shift);
3486 // Add the output offset.
3487 int16x8_t acc_s16_0_1 =
3488 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3489 int16x8_t acc_s16_2_3 =
3490 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3491 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3492 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3493 // Apply the activation function.
3494 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3495 vqmovxn_s16(acc_s16_2_3));
3496 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3497 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3498
3499 vst1q_lane_8x4(output_data, acc_u8_all, 0);
3500 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3501 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3502 2);
3503 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3504 3);
3505
3506 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
3507 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
3508 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
3509 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
3510 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
3511 left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
3512 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
3513 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
3514 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
3515 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
3516 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
3517 vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
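              // vrev32q_u16 swaps the two 16-bit halves of each 32-bit lane
              // and vtrn1 then splices in the leading bytes of the right bank,
              // so each lane's window advances by two pixels, in effect
              // [a0 a1 a2 a3] -> [a2 a3 b0 b1].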
3518
3519 output_data += depth;
3520 }
3521
3522 {
3523 acc0 = adjusted_bias_data;
3524 acc1 = adjusted_bias_data;
3525 acc2 = adjusted_bias_data;
3526 acc3 = adjusted_bias_data;
3527
3528 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3529 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3530 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3531 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3532 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3533 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3534 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3535 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3536 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3537 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3538 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3539 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3540
3541 // Fixed-point multiplication.
3542 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3543 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3544 acc0, -output_shift);
3545 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3546 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3547 acc1, -output_shift);
3548 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3549 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3550 acc2, -output_shift);
3551 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3552 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3553 acc3, -output_shift);
3554 // Add the output offset.
3555 int16x8_t acc_s16_0_1 =
3556 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3557 int16x8_t acc_s16_2_3 =
3558 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3559 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3560 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3561 // Apply the activation function.
3562 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3563 vqmovxn_s16(acc_s16_2_3));
3564 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3565 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3566
3567 vst1q_lane_8x4(output_data, acc_u8_all, 0);
3568 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3569 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3570 2);
3571 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3572 3);
3573
3574 output_data += depth;
3575 }
3576
3577 {
3578 acc0 = adjusted_bias_data;
3579 acc1 = adjusted_bias_data;
3580 acc2 = adjusted_bias_data;
3581 acc3 = adjusted_bias_data;
3582
3583 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3584 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3585 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3586 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3587 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3588 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3589 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3590 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3591 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3592 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3593 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3594 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3595
3596 // Fixed-point multiplication.
3597 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3598 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3599 acc0, -output_shift);
3600 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3601 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3602 acc1, -output_shift);
3603 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3604 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3605 acc2, -output_shift);
3606 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3607 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3608 acc3, -output_shift);
3609 // Add the output offset.
3610 int16x8_t acc_s16_0_1 =
3611 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3612 int16x8_t acc_s16_2_3 =
3613 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3614 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3615 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3616 // Apply the activation function.
3617 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3618 vqmovxn_s16(acc_s16_2_3));
3619 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3620 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3621
3622 vst1q_lane_8x4(output_data, acc_u8_all, 0);
3623 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3624 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3625 2);
3626 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3627 3);
3628
3629 left_bank_0_reg = right_bank_0_reg;
3630 left_bank_1_reg = right_bank_1_reg;
3631 left_bank_2_reg = right_bank_2_reg;
3632 left_bank_3_reg = right_bank_3_reg;
3633 left_bank_4_reg = right_bank_4_reg;
3634 left_bank_5_reg = right_bank_5_reg;
3635
3636 output_data += depth;
3637 acc0 = adjusted_bias_data;
3638 acc1 = adjusted_bias_data;
3639 acc2 = adjusted_bias_data;
3640 acc3 = adjusted_bias_data;
3641
3642 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3643 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3644 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3645 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3646 }
3647 }
3648
3649 if (residual_width > 0) {
3650 next_input_data += width_micro_stride;
3651 const int output_width = residual_width;
3652
3653 // Load next sub-micro block of data.
3654 int8x16_t right_bank_0_reg;
3655 int8x16_t right_bank_1_reg;
3656 int8x16_t right_bank_2_reg;
3657 int8x16_t right_bank_3_reg;
3658 int8x16_t right_bank_4_reg;
3659 int8x16_t right_bank_5_reg;
3660 // Logic: (output_width - 1) * stride_val < 2.
3661 const bool no_right_block = output_width < 3;
3662
3663 if (no_right_block) {
3664 // Only needed for sanitizer checks.
3665 right_bank_0_reg = vdupq_n_s8(0);
3666 right_bank_1_reg = vdupq_n_s8(0);
3667 right_bank_2_reg = vdupq_n_s8(0);
3668 right_bank_3_reg = vdupq_n_s8(0);
3669 right_bank_4_reg = vdupq_n_s8(0);
3670 right_bank_5_reg = vdupq_n_s8(0);
3671 } else {
3672 right_bank_0_reg = vld1q_s8(next_input_data);
3673 right_bank_1_reg =
3674 vld1q_s8(next_input_data + workspace_height_stride);
3675 right_bank_2_reg =
3676 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3677 right_bank_3_reg =
3678 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3679 right_bank_4_reg =
3680 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3681 right_bank_5_reg =
3682 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3683 }
3684
3685 // Iterate over input width shifts within 4x4 blocks.
3686 for (int x = 0; x < output_width; ++x) {
3687 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3688 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3689 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3690 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3691 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3692 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3693 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3694 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3695
3696 // Fixed-point multiplication.
3697 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3698 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3699 acc0, -output_shift);
3700 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3701 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3702 acc1, -output_shift);
3703 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3704 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3705 acc2, -output_shift);
3706 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3707 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3708 acc3, -output_shift);
3709 // Add the output offset.
3710 int16x8_t acc_s16_0_1 =
3711 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3712 int16x8_t acc_s16_2_3 =
3713 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3714 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3715 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3716 // Apply the activation function.
3717 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3718 vqmovxn_s16(acc_s16_2_3));
3719 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3720 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3721
3722 vst1q_lane_8x4(output_data, acc_u8_all, 0);
3723 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3724 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3725 2);
3726 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3727 3);
3728
3729 biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
3730 biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
3731 biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
3732 biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
3733 biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
3734 biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
3735
3736 output_data += depth;
3737
3738 acc0 = adjusted_bias_data;
3739 acc1 = adjusted_bias_data;
3740 acc2 = adjusted_bias_data;
3741 acc3 = adjusted_bias_data;
3742
3743 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3744 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3745 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3746 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3747 }
3748 }
3749 input_data_base += 4 * workspace_height_stride;
3750 output_data_base += 4 * output_height_stride;
3751
          // Move to the next sub-block: advance to the second set of filters
          // and to the next bias values.
3754 filter_reg_0_a = filter_reg_0_b;
3755 filter_reg_1_a = filter_reg_1_b;
3756 filter_reg_2_a = filter_reg_2_b;
3757 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
3758 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
3759 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
3760 }
3761 } else {
3762 const int8* input_data_base = input_data_depthwise;
3763 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3764 output_data_base = output_data_depthwise;
3765
3766 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
3767 bias_data += kBiasIncrement;
3768 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
3769 bias_data += kBiasIncrement;
3770
3771 for (int k_height = 0; k_height < block_height; ++k_height) {
3772 const int8* next_input_data = input_data_base;
3773 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3774 output_data = output_data_base;
3775
3776 // Load first sub-micro block of data into operational banks.
3777 int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
3778 int8x16_t left_bank_1_reg_a =
3779 vld1q_s8(next_input_data + workspace_height_stride);
3780 int8x16_t left_bank_2_reg_a =
3781 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3782 int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3783 int8x16_t left_bank_1_reg_b =
3784 vld1q_s8(next_input_data + workspace_height_stride + 16);
3785 int8x16_t left_bank_2_reg_b =
3786 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3787
3788 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3789 ++i_width) {
3790 next_input_data += width_micro_stride;
3791 const int output_width =
3792 i_width == output_width_micro_repeats ? residual_width : 4;
3793
3794 int8x16_t right_bank_0_reg_a;
3795 int8x16_t right_bank_1_reg_a;
3796 int8x16_t right_bank_2_reg_a;
3797 int8x16_t right_bank_0_reg_b;
3798 int8x16_t right_bank_1_reg_b;
3799 int8x16_t right_bank_2_reg_b;
3800 // Logic: (output_width - 1) * stride_val < 2.
3801 const bool no_right_block = output_width < 3;
3802
3803 // Load next sub-micro block of data.
3804 if (no_right_block) {
3805 // Only needed for sanitizer checks.
3806 right_bank_0_reg_a = vdupq_n_s8(0);
3807 right_bank_1_reg_a = vdupq_n_s8(0);
3808 right_bank_2_reg_a = vdupq_n_s8(0);
3809 right_bank_0_reg_b = vdupq_n_s8(0);
3810 right_bank_1_reg_b = vdupq_n_s8(0);
3811 right_bank_2_reg_b = vdupq_n_s8(0);
3812 } else {
3813 right_bank_0_reg_a = vld1q_s8(next_input_data);
3814 right_bank_1_reg_a =
3815 vld1q_s8(next_input_data + workspace_height_stride);
3816 right_bank_2_reg_a =
3817 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3818 right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3819 right_bank_1_reg_b =
3820 vld1q_s8(next_input_data + workspace_height_stride + 16);
3821 right_bank_2_reg_b =
3822 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3823 }
3824
3825 // Iterate over input width shifts within 4x4 blocks.
3826 for (int x = 0; x < output_width; ++x) {
3827 int32x4_t acc_a = adjusted_bias_data_a;
3828 int32x4_t acc_b = adjusted_bias_data_b;
3829 acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
3830 acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
3831 acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
3832 acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
3833 acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
3834 acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
3835
3836 // Fixed-point multiplication.
3837 acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
3838 acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
3839 acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3840 acc_a, -output_shift);
3841 acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3842 acc_b, -output_shift);
3843 // Add the output offset.
3844 int16x8_t acc_s16_0_0 =
3845 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
3846 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
3847 // Apply the activation function.
3848 uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
3849 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
3850 vget_low_u8(output_activation_min_vec));
3851 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
3852 vget_low_u8(output_activation_max_vec));
3853
3854 util_vst1_x8(output_data, acc_u8_0_0);
3855
3856 biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
3857 biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
3858 biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
3859 biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
3860 biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
3861 biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
3862
3863 output_data += depth;
3864 }
3865 }
3866 input_data_base += workspace_height_stride;
3867 output_data_base += output_height_stride;
3868 }
3869 }
3870 input_data_depthwise += depth_micro_stride;
3871 output_data_depthwise += 8;
3872 }
3873 } // NOLINT(readability/fn_size) Manually unrolled.
3874
3875 static inline void Run(const int8* scratch_block_data,
3876 const int8* filter_workspace, const int32* bias_data,
3877 uint8* output_block_data,
3878 const DepthwiseConvDotProdParams* function_params) {
3879 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
3880 output_block_data, function_params);
3881 }
3882 };
3883
3884 template <>
3885 struct KernelMacroBlock<
3886 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3887 QuantizationType::kNonPerChannelUint8,
3888 DepthwiseConvDepthMultiplication::kNoMultiplication,
3889 /*stride=*/2> {
3890 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3891 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3892 return vmin_u8(a, b);
3893 }
3894 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3895 return vmax_u8(a, b);
3896 }
3897
3898 static inline void KernelMacroBlockIntrinsics(
3899 const int8* scratch_block_data, const int8* filter_workspace,
3900 const int32* bias_data, uint8* output_block_data,
3901 const DepthwiseConvDotProdParams* function_params) {
3902 static constexpr QuantizationType quantization_type =
3903 QuantizationType::kNonPerChannelUint8;
3904
3905 const int workspace_height_stride =
3906 function_params->workspace_height_stride;
3907 const int input_width_overall_micro_repeats =
3908 function_params->input_width_overall_micro_repeats;
3909 const int output_width_micro_repeats =
3910 function_params->output_width_micro_repeats;
3911 const int depth_micro_repeats = function_params->depth_micro_repeats;
3912 const int depth = function_params->input_depth;
3913 constexpr int kStrideVal = 2;
3914 constexpr int kFourOverStride = 2;
3915 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
3916 TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
3917
3918 const int workspace_width_micro_repeats =
3919 function_params->workspace_width_micro_repeats;
3920 const int output_width_overall_micro_repeats =
3921 function_params->output_width_overall_micro_repeats;
3922 const int block_height = function_params->outbound_block_height;
3923 const int residual_width = function_params->output_residual_width;
3924 const int output_height_stride = function_params->output_height_stride;
3925 constexpr int kBiasIncrement = 4;
3926
3927 TFLITE_DCHECK(depth_micro_repeats > 0);
3928 const int width_micro_stride = 4 * 8;
3929 const int depth_micro_stride =
3930 width_micro_stride * input_width_overall_micro_repeats;
3931
3932 const int32 output_activation_min =
3933 function_params->quantized_activation_min;
3934 const int32 output_activation_max =
3935 function_params->quantized_activation_max;
3936 const int32 output_multiplier = function_params->output_multiplier;
3937 const int32 output_shift = function_params->output_shift;
3938 const int32 output_offset = function_params->output_offset;
3939 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3940 TFLITE_DCHECK_GE(output_activation_min, 0);
3941 TFLITE_DCHECK_LT(output_activation_min, 256);
3942 TFLITE_DCHECK_GE(output_activation_max, 0);
3943 TFLITE_DCHECK_LT(output_activation_max, 256);
3944 } else {
3945 TFLITE_DCHECK_GE(output_activation_min, -128);
3946 TFLITE_DCHECK_LT(output_activation_min, 128);
3947 TFLITE_DCHECK_GE(output_activation_max, -128);
3948 TFLITE_DCHECK_LT(output_activation_max, 128);
3949 }
    TFLITE_DCHECK_GE(output_offset, -32768);
3951 TFLITE_DCHECK_LT(output_offset, 32768);
3952
3953 // This version only does min/max on 64 bits.
3954 const int16x8_t output_offset_vec =
3955 vdupq_n_s16(static_cast<int16>(output_offset));
3956 const uint8x8_t output_activation_min_vec =
3957 vdup_n_u8(static_cast<uint8>(output_activation_min));
3958 const uint8x8_t output_activation_max_vec =
3959 vdup_n_u8(static_cast<uint8>(output_activation_max));
3960
3961 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3962
3963 TFLITE_DCHECK_LE(block_height, 2);
3964
3965 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3966 const int8* filter_block =
3967 filter_workspace + shuffled_filter_increment * j_depth;
3968
3969 if (block_height == 2) {
3970 for (int s = 0; s < 2; ++s) {
3971 // Simulate NEON-register transposition of subset of filter.
3972 int8x16_t filter_reg_0_a;
3973 int8x16_t filter_reg_1_a;
3974 int8x16_t filter_reg_2_a;
3975
3976 filter_reg_0_a = vld1q_s8(filter_block + s * 16);
3977 filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
3978 filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
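          // Rows 0, 1 and 2 of sub-block s sit at byte offsets s * 16,
          // s * 16 + 32 and s * 16 + 64, since the a/b sub-block chunks are
          // interleaved 16 bytes at a time.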
3979
3980 const int8* scratch_data =
3981 scratch_block_data + depth_micro_stride * j_depth;
3982 typename QuantizationTypeImpl<quantization_type>::ExternalType*
3983 output_data = output_block_data + 8 * j_depth;
3984 const int8* input_data_0 = scratch_data + s * 2 * 8;
3985
3986 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3987
3988 // Load first sub-micro block of data into operational banks.
3989 int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
3990 int8x16_t left_bank_1_reg =
3991 vld1q_s8(input_data_0 + workspace_height_stride);
3992 int8x16_t left_bank_2_reg =
3993 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
3994 int8x16_t left_bank_3_reg =
3995 vld1q_s8(input_data_0 + 3 * workspace_height_stride);
3996 int8x16_t left_bank_4_reg =
3997 vld1q_s8(input_data_0 + 4 * workspace_height_stride);
3998
3999 int8x16_t right_bank_0_reg;
4000 int8x16_t right_bank_1_reg;
4001 int8x16_t right_bank_2_reg;
4002 int8x16_t right_bank_3_reg;
4003 int8x16_t right_bank_4_reg;
4004
4005 int32x4_t acc0;
4006 int32x4_t acc1;
4007 int16x8_t acc_s16_0_1;
4008 uint8x8_t acc_u8;
4009
4010 int i_width = 0;
4011
          // When output_width_micro_repeats < output_width_overall_micro_repeats
          // there is a residual micro block with 0 < residual_width <= 2. Only
          // residual_width == 1 needs the tail loop below; a residual of 2 fills
          // a full stride-2 micro block and is handled by the main loop.
4015 const int adjusted_width_micro_repeats =
4016 (output_width_micro_repeats <
4017 output_width_overall_micro_repeats) &&
4018 (residual_width == 1)
4019 ? output_width_micro_repeats
4020 : output_width_overall_micro_repeats;
4021
4022 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4023 const int output_width = kFourOverStride;
4024 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4025 const int8* input_data =
4026 input_data_0 + width_micro_stride * i_width;
4027 acc0 = adjusted_bias_data;
4028 acc1 = adjusted_bias_data;
4029 right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
4030 right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
4031 workspace_height_stride);
4032
4033 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4034 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4035 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4036 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4037
4038 right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
4039 2 * workspace_height_stride);
4040 right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
4041 3 * workspace_height_stride);
4042 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4043 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4044 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4045 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4046 right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
4047 4 * workspace_height_stride);
4048
4049 // Fixed-point multiplication.
4050 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4051 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4052 acc0, -output_shift);
4053 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4054 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4055 acc1, -output_shift);
4056 // Add the output offset.
4057 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4058 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4059 // Apply the activation function.
4060 acc_u8 = vqmovxn_s16(acc_s16_0_1);
4061 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4062 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4063
4064 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
4065 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
4066 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
4067 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
4068 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
4069 acc0 = adjusted_bias_data;
4070 acc1 = adjusted_bias_data;
4071 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4072 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4073 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4074 vst1_lane_8x4(output_data_base, acc_u8, 0);
4075 vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
4076
4077 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4078 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
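            // As in the stride-1 kernel, the vrev32q/vtrn1 pair advances each
            // lane's window by two pixels, which here is exactly one output
            // step at stride 2.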
4079
4080 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4081 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4082 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4083 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4084 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4085 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4086
4087 // Fixed-point multiplication.
4088 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4089 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4090 acc0, -output_shift);
4091 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4092 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4093 acc1, -output_shift);
4094 // Add the output offset.
4095 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4096 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4097 // Apply the activation function.
4098 acc_u8 = vqmovxn_s16(acc_s16_0_1);
4099 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4100 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4101
4102 vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
4103 vst1_lane_8x4(output_data_base + depth + output_height_stride,
4104 acc_u8, 1);
4105
4106 left_bank_0_reg = right_bank_0_reg;
4107 left_bank_1_reg = right_bank_1_reg;
4108 left_bank_2_reg = right_bank_2_reg;
4109 left_bank_3_reg = right_bank_3_reg;
4110 left_bank_4_reg = right_bank_4_reg;
4111 }
4112 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
4113 TFLITE_DCHECK_NE(residual_width, kFourOverStride);
4114
4115 // No need to load next ("right") block of data.
4116
4117 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4118 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4119
4120 // Iterate over input width shifts within 4x4 blocks.
4121 {
4122 acc0 = adjusted_bias_data;
4123 acc1 = adjusted_bias_data;
4124
4125 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4126 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4127 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4128 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4129 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4130 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4131
4132 // Fixed-point multiplication.
4133 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4134 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4135 acc0, -output_shift);
4136 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4137 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4138 acc1, -output_shift);
4139 // Add the output offset.
4140 int16x8_t acc_s16_0_1 =
4141 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4142 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4143 // Apply the activation function.
4144 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4145 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4146 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4147
4148 vst1_lane_8x4(output_data_base, acc_u8, 0);
4149 vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
4150
4151 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
4152 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
4153 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
4154 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
4155 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
4156 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4157 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4158 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4159 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4160 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4161 }
4162 }
4163 bias_data += kBiasIncrement;
4164 }
4165 } else {
4166 // block_height == 1.
4167 int8x16_t filter_reg_0_a;
4168 int8x16_t filter_reg_1_a;
4169 int8x16_t filter_reg_2_a;
4170 int8x16_t filter_reg_0_b;
4171 int8x16_t filter_reg_1_b;
4172 int8x16_t filter_reg_2_b;
4173
4174 filter_reg_0_a = vld1q_s8(filter_block);
4175 filter_reg_1_a = vld1q_s8(filter_block + 32);
4176 filter_reg_2_a = vld1q_s8(filter_block + 64);
4177 filter_reg_0_b = vld1q_s8(filter_block + 16);
4178 filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
4179 filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
4180
4181 const int8* scratch_data =
4182 scratch_block_data + depth_micro_stride * j_depth;
4183 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4184 output_data = output_block_data + 8 * j_depth;
4185 const int8* input_data_0 = scratch_data;
4186
4187 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4188 bias_data += kBiasIncrement;
4189 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4190 bias_data += kBiasIncrement;
4191
4192 // Load first sub-micro block of data into operational banks.
4193 int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
4194 int8x16_t left_bank_1_reg_a =
4195 vld1q_s8(input_data_0 + workspace_height_stride);
4196 int8x16_t left_bank_2_reg_a =
4197 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4198 int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
4199 int8x16_t left_bank_1_reg_b =
4200 vld1q_s8(input_data_0 + workspace_height_stride + 16);
4201 int8x16_t left_bank_2_reg_b =
4202 vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
4203
4204 int8x16_t right_bank_0_reg_a;
4205 int8x16_t right_bank_1_reg_a;
4206 int8x16_t right_bank_2_reg_a;
4207 int8x16_t right_bank_0_reg_b;
4208 int8x16_t right_bank_1_reg_b;
4209 int8x16_t right_bank_2_reg_b;
4210
4211 int32x4_t acc0_a;
4212 int32x4_t acc0_b;
4213
4214 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4215 ++i_width) {
4216 const int output_width = i_width == output_width_micro_repeats
4217 ? residual_width
4218 : kFourOverStride;
4219 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4220 const int8* input_data = input_data_0 + width_micro_stride * i_width;
4221 const bool no_right_block = i_width == output_width_micro_repeats &&
4222 output_width_overall_micro_repeats ==
4223 workspace_width_micro_repeats;
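               // The lookahead ("right") load is skipped only on the final,
               // truncated micro block when the workspace holds no extra block to
               // its right; loading there would read past the valid workspace data.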
4224
4225 if (!no_right_block) {
4226 // Load next sub-micro block of data.
4227 right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
4228 right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
4229 workspace_height_stride);
4230 right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
4231 2 * workspace_height_stride);
4232 right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
4233 right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
4234 workspace_height_stride + 16);
4235 right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
4236 2 * workspace_height_stride + 16);
4237 }
4238
4239 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4240 output_data_base = output_data + depth * 2 * i_width;
4241
4242 // Iterate over input width shifts within 4x4 blocks.
4243 {
4244 acc0_a = adjusted_bias_data_a;
4245 acc0_b = adjusted_bias_data_b;
4246
4247 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4248 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4249 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4250 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4251 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4252 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4253
4254 // Fixed-point multiplication.
4255 acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4256 acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4257 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4258 acc0_a, -output_shift);
4259 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4260 acc0_b, -output_shift);
4261 // Add the output offset.
4262 int16x8_t acc_s16_0_1 =
4263 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4264 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4265 // Apply the activation function.
4266 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4267 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4268 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4269
4270 util_vst1_x8(output_data_base, acc_u8);
4271
4272 left_bank_0_reg_a = vrev32q_u16(left_bank_0_reg_a);
4273 left_bank_1_reg_a = vrev32q_u16(left_bank_1_reg_a);
4274 left_bank_2_reg_a = vrev32q_u16(left_bank_2_reg_a);
4275 left_bank_0_reg_b = vrev32q_u16(left_bank_0_reg_b);
4276 left_bank_1_reg_b = vrev32q_u16(left_bank_1_reg_b);
4277 left_bank_2_reg_b = vrev32q_u16(left_bank_2_reg_b);
4278 vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
4279 vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
4280 vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
4281 vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
4282 vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
4283 vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
4284 }
4285
4286 if (output_width > 1) {
4287 acc0_a = adjusted_bias_data_a;
4288 acc0_b = adjusted_bias_data_b;
4289
4290 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4291 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4292 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4293 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4294 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4295 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4296
4297 // Fixed-point multiplication.
4298 acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4299 acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4300 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4301 acc0_a, -output_shift);
4302 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4303 acc0_b, -output_shift);
4304 // Add the output offset.
4305 int16x8_t acc_s16_0_1 =
4306 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4307 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4308 // Apply the activation function.
4309 uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4310 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4311 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4312
4313 util_vst1_x8(output_data_base + depth, acc_u8);
4314
4315 left_bank_0_reg_a = right_bank_0_reg_a;
4316 left_bank_1_reg_a = right_bank_1_reg_a;
4317 left_bank_2_reg_a = right_bank_2_reg_a;
4318 left_bank_0_reg_b = right_bank_0_reg_b;
4319 left_bank_1_reg_b = right_bank_1_reg_b;
4320 left_bank_2_reg_b = right_bank_2_reg_b;
4321 }
4322 }
4323 }
4324 }
4325 } // NOLINT(readability/fn_size) Manually unrolled.
4326
4327 static inline void Run(const int8* scratch_block_data,
4328 const int8* filter_workspace, const int32* bias_data,
4329 uint8* output_block_data,
4330 const DepthwiseConvDotProdParams* function_params) {
4331 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
4332 output_block_data, function_params);
4333 }
4334 };
4335
4336 template <>
4337 struct KernelMacroBlock<
4338 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
4339 QuantizationType::kNonPerChannelUint8,
4340 DepthwiseConvDepthMultiplication::kUnitInputDepth,
4341 /*stride=*/1> {
4342 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
4343 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
4344 return vmin_u8(a, b);
4345 }
4346 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
4347 return vmax_u8(a, b);
4348 }
4349 static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
4350 return vminq_u8(a, b);
4351 }
4352 static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
4353 return vmaxq_u8(a, b);
4354 }
4355
4356 static inline void KernelMacroBlockIntrinsics(
4357 const int8* scratch_block_data, const int8* filter_workspace,
4358 const int32* bias_data, uint8* output_block_data,
4359 const DepthwiseConvDotProdParams* function_params) {
4360 static constexpr QuantizationType quantization_type =
4361 QuantizationType::kNonPerChannelUint8;
4362
4363 TFLITE_DCHECK_EQ(function_params->stride, 1);
4364 const int workspace_height_stride =
4365 function_params->workspace_height_stride;
4366 const int output_width_micro_repeats =
4367 function_params->output_width_micro_repeats;
4368 const int depth_micro_repeats = function_params->depth_micro_repeats;
4369 const int output_depth = function_params->output_depth;
4370
4371 const int output_width_overall_micro_repeats =
4372 function_params->output_width_overall_micro_repeats;
4373 const int block_height = function_params->outbound_block_height;
4374 const int residual_width = function_params->output_residual_width;
4375 const int output_height_stride = function_params->output_height_stride;
4376 constexpr int kBiasIncrement = 4;
4377
4378 TFLITE_DCHECK(depth_micro_repeats > 0);
4379
4380 const int32 output_activation_min =
4381 function_params->quantized_activation_min;
4382 const int32 output_activation_max =
4383 function_params->quantized_activation_max;
4384 const int32 output_multiplier = function_params->output_multiplier;
4385 const int32 output_shift = function_params->output_shift;
4386 const int32 output_offset = function_params->output_offset;
4387 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
4388 TFLITE_DCHECK_GE(output_activation_min, 0);
4389 TFLITE_DCHECK_LT(output_activation_min, 256);
4390 TFLITE_DCHECK_GE(output_activation_max, 0);
4391 TFLITE_DCHECK_LT(output_activation_max, 256);
4392 } else {
4393 TFLITE_DCHECK_GE(output_activation_min, -128);
4394 TFLITE_DCHECK_LT(output_activation_min, 128);
4395 TFLITE_DCHECK_GE(output_activation_max, -128);
4396 TFLITE_DCHECK_LT(output_activation_max, 128);
4397 }
4398     TFLITE_DCHECK_GE(output_offset, -32768);
4399 TFLITE_DCHECK_LT(output_offset, 32768);
4400
4401 const int16x8_t output_offset_vec =
4402 vdupq_n_s16(static_cast<int16>(output_offset));
4403 const uint8x16_t output_activation_min_vec =
4404 vdupq_n_u8(static_cast<uint8>(output_activation_min));
4405 const uint8x16_t output_activation_max_vec =
4406 vdupq_n_u8(static_cast<uint8>(output_activation_max));
4407
4408 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4409 output_data_depthwise = output_block_data;
4410 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4411 // Simulate NEON-register transposition of subset of filter.
4412 int8x16_t filter_reg_0_a;
4413 int8x16_t filter_reg_0_b;
4414 int8x16_t filter_reg_1_a;
4415 int8x16_t filter_reg_1_b;
4416 int8x16_t filter_reg_2_a;
4417 int8x16_t filter_reg_2_b;
4418 int8x16_t filter_reg_0_a_shifted;
4419 int8x16_t filter_reg_1_a_shifted;
4420 int8x16_t filter_reg_2_a_shifted;
4421
4422 filter_reg_0_a = vld1q_s8(filter_workspace);
4423 filter_workspace += 16;
4424 filter_reg_0_b = vld1q_s8(filter_workspace);
4425 filter_workspace += 16;
4426 filter_reg_1_a = vld1q_s8(filter_workspace);
4427 filter_workspace += 16;
4428 filter_reg_1_b = vld1q_s8(filter_workspace);
4429 filter_workspace += 16;
4430 filter_reg_2_a = vld1q_s8(filter_workspace);
4431 filter_workspace += 16;
4432 filter_reg_2_b = vld1q_s8(filter_workspace);
4433 filter_workspace += 16;
4434
4435 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
4436 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
4437 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
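           // Shifting each 32-bit filter group left by one byte turns taps
           // [t0 t1 t2 0] into [0 t0 t1 t2]; dotted against the same four input
           // bytes this yields the output one column to the right, so the shifted
           // copies compute the odd output columns before the input banks are
           // themselves advanced by two pixels.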
4438
4439       // When output_width_micro_repeats < output_width_overall_micro_repeats,
4440       // 0 < residual_width <= 4, and a truncated final micro block is needed
4441       // only when residual_width < 4.
4442 const int adjusted_width_micro_repeats =
4443 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
4444 (residual_width < 4)
4445 ? output_width_micro_repeats
4446 : output_width_overall_micro_repeats;
4447
4448 if (block_height == 4) {
4449 for (int s = 0; s < 2; ++s) {
4450 // Work through one slice, by row, at a time.
4451 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4452 output_data_base = output_data_depthwise + 4 * s;
4453
4454 const int8* next_input_data = scratch_block_data;
4455 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4456 output_data = output_data_base;
4457
4458 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4459 bias_data += kBiasIncrement;
4460
4461 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
4462 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
4463 int8x16_t input_bank_c_reg; // left 4, right 4, left 5, right 5.
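             // Each bank holds two input rows, eight bytes per row, split into
             // "left" (columns 0-3) and "right" (columns 4-7) 32-bit groups.
             // vdotq_four_lane_s32 broadcasts one such group against all four
             // output-channel lanes of a filter register, so lane 0 selects the
             // first row held in the bank and lane 2 the second.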
4464
4465 // Load first sub-micro block of data into operational banks.
4466 input_bank_a_reg =
4467 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
4468 // uninitialized variable.
4469 input_bank_a_reg = vld1q_lane_8x4(
4470 next_input_data + workspace_height_stride, input_bank_a_reg, 2);
4471 input_bank_b_reg = vld1q_dup_s8x4(
4472 next_input_data +
4473 2 * workspace_height_stride); // Load lane 0, avoiding
4474 // uninitialized variable.
4475 input_bank_b_reg =
4476 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4477 input_bank_b_reg, 2);
4478 input_bank_c_reg = vld1q_dup_s8x4(
4479 next_input_data +
4480 4 * workspace_height_stride); // Load lane 0, avoiding
4481 // uninitialized variable.
4482 input_bank_c_reg =
4483 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4484 input_bank_c_reg, 2);
4485
4486 int32x4_t acc0;
4487 int32x4_t acc1;
4488 int32x4_t acc2;
4489 int32x4_t acc3;
4490
4491 acc0 = adjusted_bias_data;
4492 acc1 = adjusted_bias_data;
4493 acc2 = adjusted_bias_data;
4494 acc3 = adjusted_bias_data;
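               // The dot products just below pre-accumulate the filter rows whose
               // input data is already resident in the banks (a software-pipelining
               // step); the same priming is repeated at the bottom of the loop body
               // so that loads and arithmetic overlap.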
4495
4496 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
4497 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
4498 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
4499 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
4500
4501 int i_width = 0;
4502 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4503 next_input_data += 4;
4504
4505 // Iterate over input width shifts within 4x4 blocks.
4506 {
4507 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4508 0);
4509 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4510 2);
4511 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4512 2);
4513 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4514 2);
4515 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4516 2);
4517 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4518 0);
4519 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4520 0);
4521 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4522 2);
4523
4524 // Fixed-point multiplication.
4525 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4526 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4527 acc0, -output_shift);
4528 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4529 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4530 acc1, -output_shift);
4531 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4532 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4533 acc2, -output_shift);
4534 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4535 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4536 acc3, -output_shift);
4537 // Add the output offset.
4538 int16x8_t acc_s16_0_1 =
4539 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4540 int16x8_t acc_s16_2_3 =
4541 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4542 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4543 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4544 // Apply the activation function.
4545 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4546 vqmovxn_s16(acc_s16_2_3));
4547 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4548 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4549
4550 vst1q_lane_8x4(output_data, acc_u8_all, 0);
4551 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4552 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4553 2);
4554 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4555 3);
4556
4557 output_data += output_depth;
4558 }
4559 // Load next sub-micro block of data.
4560 input_bank_a_reg =
4561 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4562 input_bank_a_reg = vld1q_lane_8x4(
4563 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4564 input_bank_b_reg =
4565 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4566 input_bank_b_reg, 1);
4567 input_bank_b_reg =
4568 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4569 input_bank_b_reg, 3);
4570 input_bank_c_reg =
4571 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4572 input_bank_c_reg, 1);
4573 input_bank_c_reg =
4574 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4575 input_bank_c_reg, 3);
4576
4577 {
4578 acc0 = adjusted_bias_data;
4579 acc1 = adjusted_bias_data;
4580 acc2 = adjusted_bias_data;
4581 acc3 = adjusted_bias_data;
4582
4583 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4584 input_bank_a_reg, 0);
4585 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4586 input_bank_a_reg, 2);
4587 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4588 input_bank_b_reg, 0);
4589 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4590 input_bank_a_reg, 2);
4591 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4592 input_bank_b_reg, 0);
4593 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4594 input_bank_b_reg, 2);
4595 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4596 input_bank_b_reg, 0);
4597 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4598 input_bank_b_reg, 2);
4599 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4600 input_bank_c_reg, 0);
4601 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4602 input_bank_b_reg, 2);
4603 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4604 input_bank_c_reg, 0);
4605 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4606 input_bank_c_reg, 2);
4607
4608 // Fixed-point multiplication.
4609 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4610 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4611 acc0, -output_shift);
4612 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4613 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4614 acc1, -output_shift);
4615 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4616 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4617 acc2, -output_shift);
4618 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4619 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4620 acc3, -output_shift);
4621 // Add the output offset.
4622 int16x8_t acc_s16_0_1 =
4623 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4624 int16x8_t acc_s16_2_3 =
4625 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4626 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4627 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4628 // Apply the activation function.
4629 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4630 vqmovxn_s16(acc_s16_2_3));
4631 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4632 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4633
4634 vst1q_lane_8x4(output_data, acc_u8_all, 0);
4635 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4636 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4637 2);
4638 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4639 3);
4640
4641 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
4642 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
4643 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
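                 // Shifting each 64-bit half (one row's eight bytes) right by 16
                 // bits discards the two columns just consumed, advancing the
                 // window by two pixels now that both the even and odd output
                 // columns are done.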
4644
4645 output_data += output_depth;
4646 }
4647
4648 {
4649 acc0 = adjusted_bias_data;
4650 acc1 = adjusted_bias_data;
4651 acc2 = adjusted_bias_data;
4652 acc3 = adjusted_bias_data;
4653
4654 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4655 0);
4656 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4657 2);
4658 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4659 0);
4660 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4661 2);
4662 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4663 0);
4664 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4665 2);
4666 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4667 0);
4668 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4669 2);
4670 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4671 0);
4672 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4673 2);
4674 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4675 0);
4676 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4677 2);
4678
4679 // Fixed-point multiplication.
4680 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4681 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4682 acc0, -output_shift);
4683 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4684 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4685 acc1, -output_shift);
4686 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4687 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4688 acc2, -output_shift);
4689 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4690 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4691 acc3, -output_shift);
4692 // Add the output offset.
4693 int16x8_t acc_s16_0_1 =
4694 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4695 int16x8_t acc_s16_2_3 =
4696 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4697 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4698 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4699 // Apply the activation function.
4700 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4701 vqmovxn_s16(acc_s16_2_3));
4702 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4703 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4704
4705 vst1q_lane_8x4(output_data, acc_u8_all, 0);
4706 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4707 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4708 2);
4709 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4710 3);
4711
4712 output_data += output_depth;
4713 }
4714
4715 {
4716 acc0 = adjusted_bias_data;
4717 acc1 = adjusted_bias_data;
4718 acc2 = adjusted_bias_data;
4719 acc3 = adjusted_bias_data;
4720
4721 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4722 input_bank_a_reg, 0);
4723 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4724 input_bank_a_reg, 2);
4725 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4726 input_bank_b_reg, 0);
4727 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4728 input_bank_a_reg, 2);
4729 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4730 input_bank_b_reg, 0);
4731 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4732 input_bank_b_reg, 2);
4733 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4734 input_bank_b_reg, 0);
4735 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4736 input_bank_b_reg, 2);
4737 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4738 input_bank_c_reg, 0);
4739 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4740 input_bank_b_reg, 2);
4741 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4742 input_bank_c_reg, 0);
4743 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4744 input_bank_c_reg, 2);
4745
4746 // Fixed-point multiplication.
4747 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4748 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4749 acc0, -output_shift);
4750 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4751 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4752 acc1, -output_shift);
4753 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4754 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4755 acc2, -output_shift);
4756 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4757 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4758 acc3, -output_shift);
4759 // Add the output offset.
4760 int16x8_t acc_s16_0_1 =
4761 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4762 int16x8_t acc_s16_2_3 =
4763 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4764 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4765 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4766 // Apply the activation function.
4767 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4768 vqmovxn_s16(acc_s16_2_3));
4769 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4770 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4771
4772 vst1q_lane_8x4(output_data, acc_u8_all, 0);
4773 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4774 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4775 2);
4776 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4777 3);
4778
4779 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
4780 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
4781 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
4782
4783 output_data += output_depth;
4784 acc0 = adjusted_bias_data;
4785 acc1 = adjusted_bias_data;
4786 acc2 = adjusted_bias_data;
4787 acc3 = adjusted_bias_data;
4788
4789 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4790 0);
4791 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4792 0);
4793 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4794 0);
4795 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4796 2);
4797 }
4798 }
4799
4800 if (i_width < output_width_overall_micro_repeats) {
4801 next_input_data += 4;
4802 const int output_width = residual_width;
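                 // Residual columns are produced one at a time; note that the banks
                 // are shifted by only 8 bits (one pixel) per iteration below,
                 // instead of the paired 16-bit advance used in the unrolled loop
                 // above.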
4803
4804 // Load next sub-micro block of data.
4805 input_bank_a_reg =
4806 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4807 input_bank_a_reg = vld1q_lane_8x4(
4808 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4809 input_bank_b_reg =
4810 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4811 input_bank_b_reg, 1);
4812 input_bank_b_reg =
4813 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4814 input_bank_b_reg, 3);
4815 input_bank_c_reg =
4816 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4817 input_bank_c_reg, 1);
4818 input_bank_c_reg =
4819 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4820 input_bank_c_reg, 3);
4821
4822 // Iterate over input width shifts within 4x4 blocks.
4823 for (int x = 0; x < output_width; ++x) {
4824 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4825 0);
4826 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4827 2);
4828 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4829 2);
4830 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4831 2);
4832 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4833 2);
4834 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4835 0);
4836 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4837 0);
4838 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4839 2);
4840
4841 // Fixed-point multiplication.
4842 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4843 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4844 acc0, -output_shift);
4845 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4846 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4847 acc1, -output_shift);
4848 acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4849 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4850 acc2, -output_shift);
4851 acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4852 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4853 acc3, -output_shift);
4854 // Add the output offset.
4855 int16x8_t acc_s16_0_1 =
4856 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4857 int16x8_t acc_s16_2_3 =
4858 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4859 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4860 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4861 // Apply the activation function.
4862 uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4863 vqmovxn_s16(acc_s16_2_3));
4864 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4865 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4866
4867 vst1q_lane_8x4(output_data, acc_u8_all, 0);
4868 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4869 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4870 2);
4871 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4872 3);
4873
4874 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
4875 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
4876 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
4877
4878 output_data += output_depth;
4879
4880 acc0 = adjusted_bias_data;
4881 acc1 = adjusted_bias_data;
4882 acc2 = adjusted_bias_data;
4883 acc3 = adjusted_bias_data;
4884
4885 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4886 0);
4887 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4888 0);
4889 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4890 0);
4891 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4892 2);
4893 }
4894 }
4895 // scratch_block_data += 4 * workspace_height_stride;
4896 output_data_base += 4 * output_height_stride;
4897
4898             // Move to the next sub-block: advance to the second set of filters
4899             // and to the next bias values.
4900 filter_reg_0_a = filter_reg_0_b;
4901 filter_reg_1_a = filter_reg_1_b;
4902 filter_reg_2_a = filter_reg_2_b;
4903 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
4904 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
4905 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
4906 }
4907 } else {
4908 // Block height < 4.
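             // Fallback for 1-3 remaining output rows: rows are processed one at a
             // time, both 4-channel sub-blocks per column, without the
             // shifted-filter trick (the banks advance by one byte per output
             // column instead).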
4909 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4910 output_data_base = output_data_depthwise;
4911
4912 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4913 bias_data += kBiasIncrement;
4914 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4915 bias_data += kBiasIncrement;
4916
4917 for (int k_height = 0; k_height < block_height; ++k_height) {
4918 const int8* next_input_data =
4919 scratch_block_data + k_height * workspace_height_stride;
4920 typename QuantizationTypeImpl<quantization_type>::ExternalType*
4921 output_data = output_data_base;
4922
4923 int8x16_t input_bank_p_reg; // left 0, right 0, left 1, right 1.
4924 int8x16_t input_bank_q_reg; // left 2, right 2, left 3, right 3.
4925
4926 // Load first sub-micro block of data into operational banks.
4927 input_bank_p_reg =
4928 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
4929 // uninitialized variable.
4930 input_bank_p_reg = vld1q_lane_8x4(
4931 next_input_data + workspace_height_stride, input_bank_p_reg, 2);
4932 input_bank_q_reg = vld1q_dup_s8x4(
4933 next_input_data +
4934 2 * workspace_height_stride); // Load lane 0, avoiding
4935 // uninitialized variable.
4936
4937 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4938 ++i_width) {
4939 next_input_data += 4;
4940 const int output_width =
4941 i_width == output_width_micro_repeats ? residual_width : 4;
4942
4943 // Load next sub-micro block of data.
4944 input_bank_p_reg =
4945 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
4946 input_bank_p_reg = vld1q_lane_8x4(
4947 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
4948 input_bank_q_reg =
4949 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4950 input_bank_q_reg, 1);
4951 // Iterate over input width shifts within 4x4 blocks.
4952 for (int x = 0; x < output_width; ++x) {
4953 int32x4_t acc_a = adjusted_bias_data_a;
4954 int32x4_t acc_b = adjusted_bias_data_b;
4955 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
4956 input_bank_p_reg, 0);
4957 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
4958 input_bank_p_reg, 2);
4959 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
4960 input_bank_q_reg, 0);
4961 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
4962 input_bank_p_reg, 0);
4963 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
4964 input_bank_p_reg, 2);
4965 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
4966 input_bank_q_reg, 0);
4967
4968 // Fixed-point multiplication.
4969 acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
4970 acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
4971 acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4972 acc_a, -output_shift);
4973 acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4974 acc_b, -output_shift);
4975 // Add the output offset.
4976 int16x8_t acc_s16_0_0 =
4977 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
4978 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
4979 // Apply the activation function.
4980 uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
4981 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
4982 vget_low_u8(output_activation_min_vec));
4983 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
4984 vget_low_u8(output_activation_max_vec));
4985
4986 util_vst1_x8(output_data, acc_u8_0_0);
4987
4988 input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
4989 input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);
4990
4991 output_data += output_depth;
4992 }
4993 }
4994 output_data_base += output_height_stride;
4995 }
4996 }
4997 output_data_depthwise += 8;
4998 }
4999 } // NOLINT(readability/fn_size) Manually unrolled.
5000
5001 static inline void Run(const int8* scratch_block_data,
5002 const int8* filter_workspace, const int32* bias_data,
5003 uint8* output_block_data,
5004 const DepthwiseConvDotProdParams* function_params) {
5005 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5006 output_block_data, function_params);
5007 }
5008 };
5009
5010 template <>
5011 struct KernelMacroBlock<
5012 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5013 QuantizationType::kNonPerChannelUint8,
5014 DepthwiseConvDepthMultiplication::kUnitInputDepth,
5015 /*stride=*/2> {
5016 static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
5017 static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
5018 return vmin_u8(a, b);
5019 }
5020 static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
5021 return vmax_u8(a, b);
5022 }
5023
5024 static inline void KernelMacroBlockIntrinsics(
5025 const int8* scratch_block_data, const int8* filter_workspace,
5026 const int32* bias_data, uint8* output_block_data,
5027 const DepthwiseConvDotProdParams* function_params) {
5028 static constexpr QuantizationType quantization_type =
5029 QuantizationType::kNonPerChannelUint8;
5030
5031 const int workspace_height_stride =
5032 function_params->workspace_height_stride;
5033 const int output_width_micro_repeats =
5034 function_params->output_width_micro_repeats;
5035 const int depth_micro_repeats = function_params->depth_micro_repeats;
5036 const int output_depth = function_params->output_depth;
5037 constexpr int kStrideVal = 2;
5038 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
5039
5040 const int output_width_overall_micro_repeats =
5041 function_params->output_width_overall_micro_repeats;
5042 const int block_height = function_params->outbound_block_height;
5043 const int residual_width = function_params->output_residual_width;
5044 const int output_height_stride = function_params->output_height_stride;
5045 constexpr int kBiasIncrement = 4;
5046
5047 const int32 output_activation_min =
5048 function_params->quantized_activation_min;
5049 const int32 output_activation_max =
5050 function_params->quantized_activation_max;
5051 const int32 output_multiplier = function_params->output_multiplier;
5052 const int32 output_shift = function_params->output_shift;
5053 const int32 output_offset = function_params->output_offset;
5054 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5055 TFLITE_DCHECK_GE(output_activation_min, 0);
5056 TFLITE_DCHECK_LT(output_activation_min, 256);
5057 TFLITE_DCHECK_GE(output_activation_max, 0);
5058 TFLITE_DCHECK_LT(output_activation_max, 256);
5059 } else {
5060 TFLITE_DCHECK_GE(output_activation_min, -128);
5061 TFLITE_DCHECK_LT(output_activation_min, 128);
5062 TFLITE_DCHECK_GE(output_activation_max, -128);
5063 TFLITE_DCHECK_LT(output_activation_max, 128);
5064 }
5065     TFLITE_DCHECK_GE(output_offset, -32768);
5066 TFLITE_DCHECK_LT(output_offset, 32768);
5067
5068 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
5069
5070 const int16x8_t output_offset_vec =
5071 vdupq_n_s16(static_cast<int16>(output_offset));
5072 const uint8x16_t output_activation_min_vec =
5073 vdupq_n_u8(static_cast<uint8>(output_activation_min));
5074 const uint8x16_t output_activation_max_vec =
5075 vdupq_n_u8(static_cast<uint8>(output_activation_max));
5076
5077 for (int j_depth = 0; j_depth < (depth_micro_repeats * 1 + 0); ++j_depth) {
5078 int8x16_t filter_reg_0_a;
5079 int8x16_t filter_reg_0_b;
5080 int8x16_t filter_reg_1_a;
5081 int8x16_t filter_reg_1_b;
5082 int8x16_t filter_reg_2_a;
5083 int8x16_t filter_reg_2_b;
5084
5085 filter_reg_0_a = vld1q_s8(filter_workspace);
5086 filter_workspace += 16;
5087 filter_reg_0_b = vld1q_s8(filter_workspace);
5088 filter_workspace += 16;
5089 filter_reg_1_a = vld1q_s8(filter_workspace);
5090 filter_workspace += 16;
5091 filter_reg_1_b = vld1q_s8(filter_workspace);
5092 filter_workspace += 16;
5093 filter_reg_2_a = vld1q_s8(filter_workspace);
5094 filter_workspace += 16;
5095 filter_reg_2_b = vld1q_s8(filter_workspace);
5096 filter_workspace += 16;
5097
5098 const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
5099 bias_data += kBiasIncrement;
5100 const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
5101 bias_data += kBiasIncrement;
5102
5103 if (block_height == 2) {
5104 const int8* scratch_data = scratch_block_data;
5105 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5106 output_data = output_block_data + 8 * j_depth;
5107
5108 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
5109 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
5110 int8x16_t input_bank_c_reg; // left 4, right 4, xxx, xxx.
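             // Output row 0 uses input rows 0-2 and output row 1 (stride 2) uses
             // rows 2-4, so the upper lanes of bank c ("xxx") are never read. Each
             // output column consumes two input pixels, hence the 16-bit bank
             // shifts after every column below.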
5111
5112 // Load first sub-micro block of data into operational banks.
5113 input_bank_a_reg =
5114 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
5115 // uninitialized variable.
5116 input_bank_a_reg = vld1q_lane_8x4(
5117 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5118 input_bank_b_reg = vld1q_dup_s8x4(
5119 scratch_data +
5120 2 * workspace_height_stride); // Load lane 0, avoiding
5121 // uninitialized variable.
5122 input_bank_b_reg = vld1q_lane_8x4(
5123 scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
5124 input_bank_c_reg = vld1q_dup_s8x4(
5125 scratch_data +
5126 4 * workspace_height_stride); // Load lane 0, avoiding
5127 // uninitialized variable.
5128
5129 int32x4_t acc0;
5130 int32x4_t acc1;
5131
5132 // When output_width_micro_repeats < output_width_overall_micro_repeats,
5133 // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
5134 // residual_width < 2.
5135 const int adjusted_width_micro_repeats =
5136 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
5137 (residual_width < 2)
5138 ? output_width_micro_repeats
5139 : output_width_overall_micro_repeats;
5140
5141 int i_width = 0;
5142 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
5143 const int8* input_data = scratch_data + 4 + 4 * i_width;
5144
5145 // Load next sub-micro block of data.
5146 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5147 input_bank_a_reg = vld1q_lane_8x4(
5148 input_data + workspace_height_stride, input_bank_a_reg, 3);
5149 input_bank_b_reg = vld1q_lane_8x4(
5150 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5151 input_bank_b_reg = vld1q_lane_8x4(
5152 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5153 input_bank_c_reg = vld1q_lane_8x4(
5154 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5155
5156 int16x8_t acc_s16_0_1;
5157 uint8x8_t acc_u8_0_1;
5158 // Iterate over input width shifts within 4x4 blocks.
5159 {
5160 acc0 = adjusted_bias_data_s_0;
5161 acc1 = adjusted_bias_data_s_0;
5162
5163 acc0 =
5164 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5165 acc0 =
5166 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5167 acc0 =
5168 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5169 acc1 =
5170 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5171 acc1 =
5172 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5173 acc1 =
5174 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5175
5176 // Fixed-point multiplication.
5177 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5178 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5179 acc0, -output_shift);
5180 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5181 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5182 acc1, -output_shift);
5183 // Add the output offset.
5184 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5185 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5186 // Apply the activation function.
5187 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5188 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5189 vget_low_u8(output_activation_min_vec));
5190 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5191 vget_low_u8(output_activation_max_vec));
5192
5193 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5194 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5195
5196 acc0 = adjusted_bias_data_s_1;
5197 acc1 = adjusted_bias_data_s_1;
5198
5199 acc0 =
5200 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5201 acc0 =
5202 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5203 acc0 =
5204 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5205 acc1 =
5206 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5207 acc1 =
5208 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5209 acc1 =
5210 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5211
5212 // Fixed-point multiplication.
5213 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5214 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5215 acc0, -output_shift);
5216 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5217 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5218 acc1, -output_shift);
5219 // Add the output offset.
5220 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5221 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5222 // Apply the activation function.
5223 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5224 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5225 vget_low_u8(output_activation_min_vec));
5226 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5227 vget_low_u8(output_activation_max_vec));
5228
5229 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5230 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5231 1);
5232
5233 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5234 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5235 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5236
5237 output_data += output_depth;
5238 }
5239
5240 // output_width == four_over_stride.
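               // Each 4-pixel micro block yields two stride-2 output columns; the
               // block above produced the first and this unrolled tail produces
               // the second.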
5241 acc0 = adjusted_bias_data_s_0;
5242 acc1 = adjusted_bias_data_s_0;
5243
5244 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5245 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5246 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5247 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5248 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5249 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5250
5251 // Fixed-point multiplication.
5252 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5253 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5254 acc0, -output_shift);
5255 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5256 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5257 acc1, -output_shift);
5258 // Add the output offset.
5259 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5260 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5261 // Apply the activation function.
5262 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5263 acc_u8_0_1 =
5264 util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5265 acc_u8_0_1 =
5266 util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5267
5268 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5269 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5270
5271 acc0 = adjusted_bias_data_s_1;
5272 acc1 = adjusted_bias_data_s_1;
5273
5274 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5275 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5276 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5277 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5278 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5279 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5280
5281 // Fixed-point multiplication.
5282 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5283 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5284 acc0, -output_shift);
5285 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5286 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5287 acc1, -output_shift);
5288 // Add the output offset.
5289 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5290 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5291 // Apply the activation function.
5292 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5293 acc_u8_0_1 =
5294 util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5295 acc_u8_0_1 =
5296 util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5297
5298 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5299 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
5300
5301 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5302 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5303 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5304
5305 output_data += output_depth;
5306 }
5307 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
5308 // output_width == 1.
5309 const int8* input_data = scratch_data + 4 + 4 * i_width;
5310
5311 // Load next sub-micro block of data.
5312 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5313 input_bank_a_reg = vld1q_lane_8x4(
5314 input_data + workspace_height_stride, input_bank_a_reg, 3);
5315 input_bank_b_reg = vld1q_lane_8x4(
5316 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5317 input_bank_b_reg = vld1q_lane_8x4(
5318 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5319 input_bank_c_reg = vld1q_lane_8x4(
5320 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5321
5322 int16x8_t acc_s16_0_1;
5323 uint8x8_t acc_u8_0_1;
5324 // Iterate over input width shifts within 4x4 blocks.
5325 {
5326 acc0 = adjusted_bias_data_s_0;
5327 acc1 = adjusted_bias_data_s_0;
5328
5329 acc0 =
5330 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5331 acc0 =
5332 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5333 acc0 =
5334 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5335 acc1 =
5336 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5337 acc1 =
5338 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5339 acc1 =
5340 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5341
5342 // Fixed-point multiplication.
5343 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5344 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5345 acc0, -output_shift);
5346 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5347 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5348 acc1, -output_shift);
5349 // Add the output offset.
5350 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5351 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5352 // Apply the activation function.
5353 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5354 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5355 vget_low_u8(output_activation_min_vec));
5356 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5357 vget_low_u8(output_activation_max_vec));
5358
5359 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5360 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5361
5362 acc0 = adjusted_bias_data_s_1;
5363 acc1 = adjusted_bias_data_s_1;
5364
5365 acc0 =
5366 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5367 acc0 =
5368 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5369 acc0 =
5370 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5371 acc1 =
5372 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5373 acc1 =
5374 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5375 acc1 =
5376 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5377
5378 // Fixed-point multiplication.
5379 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5380 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5381 acc0, -output_shift);
5382 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5383 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5384 acc1, -output_shift);
5385 // Add the output offset.
5386 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5387 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5388 // Apply the activation function.
5389 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5390 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5391 vget_low_u8(output_activation_min_vec));
5392 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5393 vget_low_u8(output_activation_max_vec));
5394
5395 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5396 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5397 1);
5398
5399 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5400 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5401 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5402
5403 output_data += output_depth;
5404 }
5405 }
5406 } else {
5407 TFLITE_DCHECK_EQ(block_height, 1);
5408 // Work through one slice, by row, at a time.
5409 const int8* scratch_data = scratch_block_data;
5410 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5411 output_data = output_block_data + 8 * j_depth;
5412
5413 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
5414 int8x16_t input_bank_b_reg; // left 2, right 2, xxx, xxx.
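             // A single output row needs only input rows 0-2, so two banks
             // suffice and the upper lanes of bank b ("xxx") are never read.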
5415
5416 // Load first sub-micro block of data into operational banks.
5417 input_bank_a_reg =
5418 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
5419 // uninitialized variable.
5420 input_bank_a_reg = vld1q_lane_8x4(
5421 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5422 input_bank_b_reg = vld1q_dup_s8x4(
5423 scratch_data +
5424 2 * workspace_height_stride); // Load lane 0, avoiding
5425 // uninitialized variable.
5426
5427 int32x4_t acc0;
5428 int32x4_t acc1;
5429
5430 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5431 ++i_width) {
5432 const int output_width =
5433 i_width == output_width_micro_repeats ? residual_width : 2;
5434
5435 TFLITE_DCHECK_LE(output_width, 2);
5436 TFLITE_DCHECK_GE(output_width, 1);
5437 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
5438 const int8* input_data = scratch_data + 4 + 4 * i_width;
5439
5440 // Load next sub-micro block of data.
5441 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5442 input_bank_a_reg = vld1q_lane_8x4(
5443 input_data + workspace_height_stride, input_bank_a_reg, 3);
5444 input_bank_b_reg = vld1q_lane_8x4(
5445 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5446
5447 int16x8_t acc_s16_0_1;
5448 uint8x8_t acc_u8_0_1;
5449
5450 // Iterate over input width shifts within 4x4 blocks.
5451 {
5452 acc0 = adjusted_bias_data_s_0;
5453
5454 acc0 =
5455 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5456 acc0 =
5457 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5458 acc0 =
5459 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5460
5461 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5462 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5463 acc0, -output_shift);
5464
5465 // Second sub-block accumulation.
5466 acc1 = adjusted_bias_data_s_1;
5467
5468 acc1 =
5469 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5470 acc1 =
5471 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5472 acc1 =
5473 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5474
5475 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5476 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5477 acc1, -output_shift);
5478
5479 // Add the output offset.
5480 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5481 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5482 // Apply the activation function.
5483 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5484 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5485 vget_low_u8(output_activation_min_vec));
5486 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5487 vget_low_u8(output_activation_max_vec));
5488
5489 // This stores the results for both sub-blocks together.
5490 util_vst1_x8(output_data, acc_u8_0_1);
5491
5492 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5493 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5494
5495 output_data += output_depth;
5496 }
5497 if (output_width == 2) {
5498 acc0 = adjusted_bias_data_s_0;
5499
5500 acc0 =
5501 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5502 acc0 =
5503 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5504 acc0 =
5505 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5506
5507 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5508 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5509 acc0, -output_shift);
5510
5511 // Second sub-block accumulation.
5512 acc1 = adjusted_bias_data_s_1;
5513
5514 acc1 =
5515 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5516 acc1 =
5517 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5518 acc1 =
5519 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5520
5521 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5522 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5523 acc1, -output_shift);
5524
5525 // Add the output offset.
5526 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5527 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5528 // Apply the activation function.
5529 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5530 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5531 vget_low_u8(output_activation_min_vec));
5532 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5533 vget_low_u8(output_activation_max_vec));
5534
5535 // This stores the results for both sub-blocks together.
5536 util_vst1_x8(output_data, acc_u8_0_1);
5537
5538 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5539 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5540
5541 output_data += output_depth;
5542 }
5543 }
5544 }
5545 }
5546 }
5547
5548 static inline void Run(const int8* scratch_block_data,
5549 const int8* filter_workspace, const int32* bias_data,
5550 uint8* output_block_data,
5551 const DepthwiseConvDotProdParams* function_params) {
5552 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5553 output_block_data, function_params);
5554 }
5555 };
5556
5557 template <>
5558 struct KernelMacroBlock<
5559 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5560 QuantizationType::kPerChannelInt8,
5561 DepthwiseConvDepthMultiplication::kNoMultiplication,
5562 /*stride=*/1> {
5563 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
5564 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
5565 return vmin_s8(a, b);
5566 }
5567 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
5568 return vmax_s8(a, b);
5569 }
5570 static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
5571 return vminq_s8(a, b);
5572 }
5573 static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
5574 return vmaxq_s8(a, b);
5575 }
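       // Per-channel int8 variant: the structure below mirrors the uint8 kernels
       // above, but narrowing saturates to signed int8 (vqmovn_s16) and, as a
       // sketch of the per-channel requantization, the multiplier and shift are
       // read per 4-channel group from output_multiplier_per_channel /
       // output_shift_per_channel rather than broadcast from scalars.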
5576
5577 static inline void KernelMacroBlockIntrinsics(
5578 const int8* scratch_block_data, const int8* filter_workspace,
5579 const int32* bias_data, int8* output_block_data,
5580 const DepthwiseConvDotProdParams* function_params) {
5581 static constexpr QuantizationType quantization_type =
5582 QuantizationType::kPerChannelInt8;
5583
5584 const int workspace_height_stride =
5585 function_params->workspace_height_stride;
5586 const int input_width_overall_micro_repeats =
5587 function_params->input_width_overall_micro_repeats;
5588 const int output_width_micro_repeats =
5589 function_params->output_width_micro_repeats;
5590 const int depth_micro_repeats = function_params->depth_micro_repeats;
5591 const int depth = function_params->input_depth;
5592
5593 const int output_width_overall_micro_repeats =
5594 function_params->output_width_overall_micro_repeats;
5595 const int block_height = function_params->outbound_block_height;
5596 const int residual_width = function_params->output_residual_width;
5597 const int output_height_stride = function_params->output_height_stride;
5598 constexpr int kBiasIncrement = 4;
5599
5600 TFLITE_DCHECK(depth_micro_repeats > 0);
5601 const int width_micro_stride = 4 * 8;
5602 const int depth_micro_stride =
5603 width_micro_stride * input_width_overall_micro_repeats;
5604
5605 const int32 output_activation_min =
5606 function_params->quantized_activation_min;
5607 const int32 output_activation_max =
5608 function_params->quantized_activation_max;
5609 const int32 output_offset = function_params->output_offset;
5610 const int32* output_shift_per_channel =
5611 function_params->output_shift_per_channel;
5612 const int32* output_multiplier_per_channel =
5613 function_params->output_multiplier_per_channel;
5614 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5615 TFLITE_DCHECK_GE(output_activation_min, 0);
5616 TFLITE_DCHECK_LT(output_activation_min, 256);
5617 TFLITE_DCHECK_GE(output_activation_max, 0);
5618 TFLITE_DCHECK_LT(output_activation_max, 256);
5619 } else {
5620 TFLITE_DCHECK_GE(output_activation_min, -128);
5621 TFLITE_DCHECK_LT(output_activation_min, 128);
5622 TFLITE_DCHECK_GE(output_activation_max, -128);
5623 TFLITE_DCHECK_LT(output_activation_max, 128);
5624 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
5625 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
5626 }
5627 TFLITE_DCHECK_GE(output_offset, -32768);
5628 TFLITE_DCHECK_LT(output_offset, 32768);
5629
5630 const int16x8_t output_offset_vec =
5631 vdupq_n_s16(static_cast<int16>(output_offset));
5632 const int8x16_t output_activation_min_vec =
5633 vdupq_n_s8(static_cast<int8>(output_activation_min));
5634 const int8x16_t output_activation_max_vec =
5635 vdupq_n_s8(static_cast<int8>(output_activation_max));
5636
5637 const int8* input_data_depthwise = scratch_block_data;
5638 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5639 output_data_depthwise = output_block_data;
5640 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5641 // Simulate NEON-register transposition of subset of filter.
5642 int8x16_t filter_reg_0_a;
5643 int8x16_t filter_reg_0_b;
5644 int8x16_t filter_reg_1_a;
5645 int8x16_t filter_reg_1_b;
5646 int8x16_t filter_reg_2_a;
5647 int8x16_t filter_reg_2_b;
5648 int8x16_t filter_reg_0_a_shifted;
5649 int8x16_t filter_reg_1_a_shifted;
5650 int8x16_t filter_reg_2_a_shifted;
5651
5652 filter_reg_0_a = vld1q_s8(filter_workspace);
5653 filter_workspace += 16;
5654 filter_reg_0_b = vld1q_s8(filter_workspace);
5655 filter_workspace += 16;
5656 filter_reg_1_a = vld1q_s8(filter_workspace);
5657 filter_workspace += 16;
5658 filter_reg_1_b = vld1q_s8(filter_workspace);
5659 filter_workspace += 16;
5660 filter_reg_2_a = vld1q_s8(filter_workspace);
5661 filter_workspace += 16;
5662 filter_reg_2_b = vld1q_s8(filter_workspace);
5663 filter_workspace += 16;
5664
5665 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
5666 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
5667 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
5668
5669 if (block_height == 4) {
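// Four output rows are produced per pass. The 8 output channels of this
// depth micro block are processed as two sub-blocks of 4 (the loop over s),
// each with its own bias, output multiplier and output shift.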
5670 for (int s = 0; s < 2; ++s) {
5671 // Work through one slice, by row, at a time.
5672 const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
5673 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5674 output_data_base = output_data_depthwise + 4 * s;
5675
5676 const int8* next_input_data = input_data_base;
5677 typename QuantizationTypeImpl<quantization_type>::ExternalType*
5678 output_data = output_data_base;
5679
5680 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
5681 bias_data += kBiasIncrement;
5682
5683 const int32x4_t output_shift =
5684 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
5685 const int32x4_t output_multiplier =
5686 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
5687
5688 // Load first sub-micro block of data into operational banks.
5689 int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
5690 int8x16_t left_bank_1_reg =
5691 vld1q_s8(next_input_data + workspace_height_stride);
5692 int8x16_t left_bank_2_reg =
5693 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5694 int8x16_t left_bank_3_reg =
5695 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5696 int8x16_t left_bank_4_reg =
5697 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5698 int8x16_t left_bank_5_reg =
5699 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5700
5701 int32x4_t acc0;
5702 int32x4_t acc1;
5703 int32x4_t acc2;
5704 int32x4_t acc3;
5705
5706 acc0 = adjusted_bias_data;
5707 acc1 = adjusted_bias_data;
5708 acc2 = adjusted_bias_data;
5709 acc3 = adjusted_bias_data;
5710
5711 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5712 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5713 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5714 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
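// Each accumulator above is pre-seeded with one filter-row term; the other
// two terms are added inside the width loop, and the seeds for the next
// column group are recomputed at the bottom of the loop (software pipelining).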
5715
5716 for (int i_width = 0; i_width < output_width_micro_repeats;
5717 ++i_width) {
5718 next_input_data += width_micro_stride;
5719
5720 // Iterate over input width shifts within 4x4 blocks.
5721 {
5722 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5723 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5724 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5725 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5726 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5727 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5728 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5729 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5730
5731 // Fixed-point requantization: saturating rounding-doubling high multiply by the per-channel multiplier, then a rounding divide by the per-channel power-of-two shift.
5732 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5733 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5734 acc0, output_shift);
5735 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5736 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5737 acc1, output_shift);
5738 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5739 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5740 acc2, output_shift);
5741 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5742 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5743 acc3, output_shift);
5744 // Add the output offset.
5745 int16x8_t acc_s16_0_1 =
5746 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5747 int16x8_t acc_s16_2_3 =
5748 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5749 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5750 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5751 // Apply the activation function.
5752 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5753 vqmovxn_s16(acc_s16_2_3));
5754 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5755 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5756
5757 vst1q_lane_8x4(output_data, acc_u8_all, 0);
5758 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5759 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5760 2);
5761 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5762 3);
5763
5764 output_data += depth;
5765 }
5766
5767 // Load next sub-micro block of data.
5768 int8x16_t right_bank_0_reg;
5769 int8x16_t right_bank_1_reg;
5770 int8x16_t right_bank_2_reg;
5771 int8x16_t right_bank_3_reg;
5772 int8x16_t right_bank_4_reg;
5773 int8x16_t right_bank_5_reg;
5774
5775 // Within the main width loop the next ("right") block always exists, so loading it is always valid.
5776 right_bank_0_reg = vld1q_s8(next_input_data);
5777 right_bank_1_reg =
5778 vld1q_s8(next_input_data + workspace_height_stride);
5779 right_bank_2_reg =
5780 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5781 right_bank_3_reg =
5782 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5783 right_bank_4_reg =
5784 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5785 right_bank_5_reg =
5786 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5787
5788 {
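// Second column step: the filter registers pre-shifted left by one byte
// within each 32-bit lane line up with the input one pixel to the right,
// giving the x+1 outputs from the same input banks.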
5789 acc0 = adjusted_bias_data;
5790 acc1 = adjusted_bias_data;
5791 acc2 = adjusted_bias_data;
5792 acc3 = adjusted_bias_data;
5793
5794 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5795 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5796 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5797 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5798 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5799 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5800 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5801 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5802 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5803 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5804 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5805 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5806
5807 // Fixed-point multiplication.
5808 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5809 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5810 acc0, output_shift);
5811 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5812 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5813 acc1, output_shift);
5814 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5815 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5816 acc2, output_shift);
5817 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5818 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5819 acc3, output_shift);
5820 // Add the output offset.
5821 int16x8_t acc_s16_0_1 =
5822 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5823 int16x8_t acc_s16_2_3 =
5824 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5825 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5826 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5827 // Apply the activation function.
5828 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5829 vqmovxn_s16(acc_s16_2_3));
5830 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5831 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5832
5833 vst1q_lane_8x4(output_data, acc_u8_all, 0);
5834 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5835 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5836 2);
5837 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5838 3);
5839
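// Advance the input window by two pixels: reverse the 16-bit pairs within
// each lane and merge in the leading pairs of the right-hand banks, so the
// remaining two column steps can reuse the unshifted and shifted filters.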
5840 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
5841 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
5842 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
5843 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
5844 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
5845 left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
5846 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
5847 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
5848 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
5849 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
5850 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
5851 vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
5852
5853 output_data += depth;
5854 }
5855
5856 {
5857 acc0 = adjusted_bias_data;
5858 acc1 = adjusted_bias_data;
5859 acc2 = adjusted_bias_data;
5860 acc3 = adjusted_bias_data;
5861
5862 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5863 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5864 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5865 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5866 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5867 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5868 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5869 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5870 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5871 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5872 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5873 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5874
5875 // Fixed-point multiplication.
5876 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5877 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5878 acc0, output_shift);
5879 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5880 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5881 acc1, output_shift);
5882 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5883 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5884 acc2, output_shift);
5885 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5886 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5887 acc3, output_shift);
5888 // Add the output offset.
5889 int16x8_t acc_s16_0_1 =
5890 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5891 int16x8_t acc_s16_2_3 =
5892 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5893 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5894 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5895 // Apply the activation function.
5896 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5897 vqmovxn_s16(acc_s16_2_3));
5898 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5899 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5900
5901 vst1q_lane_8x4(output_data, acc_u8_all, 0);
5902 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5903 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5904 2);
5905 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5906 3);
5907
5908 output_data += depth;
5909 }
5910
5911 {
5912 acc0 = adjusted_bias_data;
5913 acc1 = adjusted_bias_data;
5914 acc2 = adjusted_bias_data;
5915 acc3 = adjusted_bias_data;
5916
5917 acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5918 acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5919 acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5920 acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5921 acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5922 acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5923 acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5924 acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5925 acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5926 acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5927 acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5928 acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5929
5930 // Fixed-point multiplication.
5931 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5932 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5933 acc0, output_shift);
5934 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5935 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5936 acc1, output_shift);
5937 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5938 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5939 acc2, output_shift);
5940 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5941 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5942 acc3, output_shift);
5943 // Add the output offset.
5944 int16x8_t acc_s16_0_1 =
5945 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5946 int16x8_t acc_s16_2_3 =
5947 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5948 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5949 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5950 // Apply the activation function.
5951 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5952 vqmovxn_s16(acc_s16_2_3));
5953 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5954 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5955
5956 vst1q_lane_8x4(output_data, acc_u8_all, 0);
5957 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5958 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5959 2);
5960 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5961 3);
5962
5963 left_bank_0_reg = right_bank_0_reg;
5964 left_bank_1_reg = right_bank_1_reg;
5965 left_bank_2_reg = right_bank_2_reg;
5966 left_bank_3_reg = right_bank_3_reg;
5967 left_bank_4_reg = right_bank_4_reg;
5968 left_bank_5_reg = right_bank_5_reg;
5969
5970 output_data += depth;
5971 acc0 = adjusted_bias_data;
5972 acc1 = adjusted_bias_data;
5973 acc2 = adjusted_bias_data;
5974 acc3 = adjusted_bias_data;
5975
5976 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5977 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5978 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5979 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5980 }
5981 }
5982
5983 if (residual_width > 0) {
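// Trailing output columns that do not fill a complete width micro block.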
5984 next_input_data += width_micro_stride;
5985 const int output_width = residual_width;
5986
5987 // Load next sub-micro block of data.
5988 int8x16_t right_bank_0_reg;
5989 int8x16_t right_bank_1_reg;
5990 int8x16_t right_bank_2_reg;
5991 int8x16_t right_bank_3_reg;
5992 int8x16_t right_bank_4_reg;
5993 int8x16_t right_bank_5_reg;
5994 // Logic: (output_width - 1) * stride_val < 2.
5995 const bool no_right_block = output_width < 3;
5996
5997 if (no_right_block) {
5998 // Only needed for sanitizer checks.
5999 right_bank_0_reg = vdupq_n_s8(0);
6000 right_bank_1_reg = vdupq_n_s8(0);
6001 right_bank_2_reg = vdupq_n_s8(0);
6002 right_bank_3_reg = vdupq_n_s8(0);
6003 right_bank_4_reg = vdupq_n_s8(0);
6004 right_bank_5_reg = vdupq_n_s8(0);
6005 } else {
6006 right_bank_0_reg = vld1q_s8(next_input_data);
6007 right_bank_1_reg =
6008 vld1q_s8(next_input_data + workspace_height_stride);
6009 right_bank_2_reg =
6010 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6011 right_bank_3_reg =
6012 vld1q_s8(next_input_data + 3 * workspace_height_stride);
6013 right_bank_4_reg =
6014 vld1q_s8(next_input_data + 4 * workspace_height_stride);
6015 right_bank_5_reg =
6016 vld1q_s8(next_input_data + 5 * workspace_height_stride);
6017 }
6018
6019 // Iterate over input width shifts within 4x4 blocks.
6020 for (int x = 0; x < output_width; ++x) {
6021 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6022 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6023 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
6024 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
6025 acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
6026 acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
6027 acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
6028 acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
6029
6030 // Fixed-point multiplication.
6031 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6032 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6033 acc0, output_shift);
6034 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6035 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6036 acc1, output_shift);
6037 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6038 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6039 acc2, output_shift);
6040 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6041 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6042 acc3, output_shift);
6043 // Add the output offset.
6044 int16x8_t acc_s16_0_1 =
6045 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6046 int16x8_t acc_s16_2_3 =
6047 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6048 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6049 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6050 // Apply the activation function.
6051 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
6052 vqmovxn_s16(acc_s16_2_3));
6053 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6054 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6055
6056 vst1q_lane_8x4(output_data, acc_u8_all, 0);
6057 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
6058 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
6059 2);
6060 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
6061 3);
6062
6063 biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
6064 biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
6065 biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
6066 biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
6067 biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
6068 biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
6069
6070 output_data += depth;
6071
6072 acc0 = adjusted_bias_data;
6073 acc1 = adjusted_bias_data;
6074 acc2 = adjusted_bias_data;
6075 acc3 = adjusted_bias_data;
6076
6077 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6078 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6079 acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6080 acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6081 }
6082 }
6083 input_data_base += 4 * workspace_height_stride;
6084 output_data_base += 4 * output_height_stride;
6085
6086 // Move to next sub-block: advance to second set of filters, to new
6087 // bias.
6088 filter_reg_0_a = filter_reg_0_b;
6089 filter_reg_1_a = filter_reg_1_b;
6090 filter_reg_2_a = filter_reg_2_b;
6091 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
6092 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
6093 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
6094 }
6095 } else {
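// Fewer than four output rows remain: process one row at a time, storing
// all 8 channels of the micro block at each output position.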
6096 const int8* input_data_base = input_data_depthwise;
6097 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6098 output_data_base = output_data_depthwise;
6099
6100 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6101 bias_data += kBiasIncrement;
6102 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6103 bias_data += kBiasIncrement;
6104
6105 const int32x4_t output_shift_a =
6106 vld1q_s32(output_shift_per_channel + j_depth * 8);
6107 const int32x4_t output_multiplier_a =
6108 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6109 const int32x4_t output_shift_b =
6110 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6111 const int32x4_t output_multiplier_b =
6112 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6113
6114 for (int k_height = 0; k_height < block_height; ++k_height) {
6115 const int8* next_input_data = input_data_base;
6116 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6117 output_data = output_data_base;
6118
6119 // Load first sub-micro block of data into operational banks.
6120 int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
6121 int8x16_t left_bank_1_reg_a =
6122 vld1q_s8(next_input_data + workspace_height_stride);
6123 int8x16_t left_bank_2_reg_a =
6124 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6125 int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6126 int8x16_t left_bank_1_reg_b =
6127 vld1q_s8(next_input_data + workspace_height_stride + 16);
6128 int8x16_t left_bank_2_reg_b =
6129 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6130
6131 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6132 ++i_width) {
6133 next_input_data += width_micro_stride;
6134 const int output_width =
6135 i_width == output_width_micro_repeats ? residual_width : 4;
6136
6137 int8x16_t right_bank_0_reg_a;
6138 int8x16_t right_bank_1_reg_a;
6139 int8x16_t right_bank_2_reg_a;
6140 int8x16_t right_bank_0_reg_b;
6141 int8x16_t right_bank_1_reg_b;
6142 int8x16_t right_bank_2_reg_b;
6143 // Logic: (output_width - 1) * stride_val < 2.
6144 const bool no_right_block = output_width < 3;
6145
6146 // Load next sub-micro block of data.
6147 if (no_right_block) {
6148 // Only needed for sanitizer checks.
6149 right_bank_0_reg_a = vdupq_n_s8(0);
6150 right_bank_1_reg_a = vdupq_n_s8(0);
6151 right_bank_2_reg_a = vdupq_n_s8(0);
6152 right_bank_0_reg_b = vdupq_n_s8(0);
6153 right_bank_1_reg_b = vdupq_n_s8(0);
6154 right_bank_2_reg_b = vdupq_n_s8(0);
6155 } else {
6156 right_bank_0_reg_a = vld1q_s8(next_input_data);
6157 right_bank_1_reg_a =
6158 vld1q_s8(next_input_data + workspace_height_stride);
6159 right_bank_2_reg_a =
6160 vld1q_s8(next_input_data + 2 * workspace_height_stride);
6161 right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6162 right_bank_1_reg_b =
6163 vld1q_s8(next_input_data + workspace_height_stride + 16);
6164 right_bank_2_reg_b =
6165 vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6166 }
6167
6168 // Iterate over input width shifts within 4x4 blocks.
6169 for (int x = 0; x < output_width; ++x) {
6170 int32x4_t acc_a = adjusted_bias_data_a;
6171 int32x4_t acc_b = adjusted_bias_data_b;
6172 acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
6173 acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
6174 acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
6175 acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
6176 acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
6177 acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
6178
6179 // Fixed-point multiplication.
6180 acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
6181 acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
6182 acc_a =
6183 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6184 acc_a, output_shift_a);
6185 acc_b =
6186 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6187 acc_b, output_shift_b);
6188 // Add the output offset.
6189 int16x8_t acc_s16_0_0 =
6190 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
6191 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
6192 // Apply the activation function.
6193 int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
6194 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
6195 vget_low_s8(output_activation_min_vec));
6196 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
6197 vget_low_s8(output_activation_max_vec));
6198
6199 vst1_s8(output_data, acc_u8_0_0);
6200
6201 biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
6202 biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
6203 biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
6204 biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
6205 biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
6206 biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
6207
6208 output_data += depth;
6209 }
6210 }
6211 input_data_base += workspace_height_stride;
6212 output_data_base += output_height_stride;
6213 }
6214 }
6215 input_data_depthwise += depth_micro_stride;
6216 output_data_depthwise += 8;
6217 }
6218 } // NOLINT(readability/fn_size) Manually unrolled.
6219
6220 static inline void Run(const int8* scratch_block_data,
6221 const int8* filter_workspace, const int32* bias_data,
6222 int8* output_block_data,
6223 const DepthwiseConvDotProdParams* function_params) {
6224 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6225 output_block_data, function_params);
6226 }
6227 };
6228
6229 template <>
6230 struct KernelMacroBlock<
6231 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6232 QuantizationType::kPerChannelInt8,
6233 DepthwiseConvDepthMultiplication::kNoMultiplication,
6234 /*stride=*/2> {
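// Per-channel int8 quantization, no depth multiplication, stride 2. Each
// pass produces at most two output rows and two output columns per width
// micro block.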
6235 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6236 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6237 return vmin_s8(a, b);
6238 }
6239 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6240 return vmax_s8(a, b);
6241 }
6242
6243 static inline void KernelMacroBlockIntrinsics(
6244 const int8* scratch_block_data, const int8* filter_workspace,
6245 const int32* bias_data, int8* output_block_data,
6246 const DepthwiseConvDotProdParams* function_params) {
6247 static constexpr QuantizationType quantization_type =
6248 QuantizationType::kPerChannelInt8;
6249
6250 const int workspace_height_stride =
6251 function_params->workspace_height_stride;
6252 const int input_width_overall_micro_repeats =
6253 function_params->input_width_overall_micro_repeats;
6254 const int output_width_micro_repeats =
6255 function_params->output_width_micro_repeats;
6256 const int depth_micro_repeats = function_params->depth_micro_repeats;
6257 const int depth = function_params->input_depth;
6258 constexpr int kStrideVal = 2;
6259 constexpr int kFourOverStride = 2;
6260 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
6261 TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
6262
6263 const int workspace_width_micro_repeats =
6264 function_params->workspace_width_micro_repeats;
6265 const int output_width_overall_micro_repeats =
6266 function_params->output_width_overall_micro_repeats;
6267 const int block_height = function_params->outbound_block_height;
6268 const int residual_width = function_params->output_residual_width;
6269 const int output_height_stride = function_params->output_height_stride;
6270 constexpr int kBiasIncrement = 4;
6271
6272 TFLITE_DCHECK(depth_micro_repeats > 0);
6273 const int width_micro_stride = 4 * 8;
6274 const int depth_micro_stride =
6275 width_micro_stride * input_width_overall_micro_repeats;
6276
6277 const int32 output_activation_min =
6278 function_params->quantized_activation_min;
6279 const int32 output_activation_max =
6280 function_params->quantized_activation_max;
6281 const int32 output_offset = function_params->output_offset;
6282 const int32* output_shift_per_channel =
6283 function_params->output_shift_per_channel;
6284 const int32* output_multiplier_per_channel =
6285 function_params->output_multiplier_per_channel;
6286 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6287 TFLITE_DCHECK_GE(output_activation_min, 0);
6288 TFLITE_DCHECK_LT(output_activation_min, 256);
6289 TFLITE_DCHECK_GE(output_activation_max, 0);
6290 TFLITE_DCHECK_LT(output_activation_max, 256);
6291 } else {
6292 TFLITE_DCHECK_GE(output_activation_min, -128);
6293 TFLITE_DCHECK_LT(output_activation_min, 128);
6294 TFLITE_DCHECK_GE(output_activation_max, -128);
6295 TFLITE_DCHECK_LT(output_activation_max, 128);
6296 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6297 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6298 }
6299 TFLITE_DCHECK_GE(output_offset, -32768);
6300 TFLITE_DCHECK_LT(output_offset, 32768);
6301
6302 // This version only does min/max on 64 bits.
6303 const int16x8_t output_offset_vec =
6304 vdupq_n_s16(static_cast<int16>(output_offset));
6305 const int8x8_t output_activation_min_vec =
6306 vdup_n_s8(static_cast<int8>(output_activation_min));
6307 const int8x8_t output_activation_max_vec =
6308 vdup_n_s8(static_cast<int8>(output_activation_max));
6309
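// Per depth micro block: 2 sub-blocks of 4 channels x 3 filter rows x 16 bytes.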
6310 constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
6311
6312 TFLITE_DCHECK_LE(block_height, 2);
6313
6314 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6315 const int8* filter_block =
6316 filter_workspace + shuffled_filter_increment * j_depth;
6317
6318 if (block_height == 2) {
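// Two-row path: the 8 channels are handled as two sub-blocks of 4 (loop
// over s), each reloading its own filter slices, bias, multiplier and shift.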
6319 for (int s = 0; s < 2; ++s) {
6320 // Simulate NEON-register transposition of subset of filter.
6321 int8x16_t filter_reg_0_a;
6322 int8x16_t filter_reg_1_a;
6323 int8x16_t filter_reg_2_a;
6324
6325 filter_reg_0_a = vld1q_s8(filter_block + s * 16);
6326 filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
6327 filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
6328
6329 const int8* scratch_data =
6330 scratch_block_data + depth_micro_stride * j_depth;
6331 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6332 output_data = output_block_data + 8 * j_depth;
6333 const int8* input_data_0 = scratch_data + s * 2 * 8;
6334
6335 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6336
6337 const int32x4_t output_shift =
6338 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6339 const int32x4_t output_multiplier =
6340 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6341
6342 // Load first sub-micro block of data into operational banks.
6343 int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
6344 int8x16_t left_bank_1_reg =
6345 vld1q_s8(input_data_0 + workspace_height_stride);
6346 int8x16_t left_bank_2_reg =
6347 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6348 int8x16_t left_bank_3_reg =
6349 vld1q_s8(input_data_0 + 3 * workspace_height_stride);
6350 int8x16_t left_bank_4_reg =
6351 vld1q_s8(input_data_0 + 4 * workspace_height_stride);
6352
6353 int8x16_t right_bank_0_reg;
6354 int8x16_t right_bank_1_reg;
6355 int8x16_t right_bank_2_reg;
6356 int8x16_t right_bank_3_reg;
6357 int8x16_t right_bank_4_reg;
6358
6359 int32x4_t acc0;
6360 int32x4_t acc1;
6361 int16x8_t acc_s16_0_1;
6362 int8x8_t acc_u8;
6363
6364 int i_width = 0;
6365
6366 // When output_width_micro_repeats <
6367 // output_width_overall_micro_repeats, the stride-2 residual width satisfies
6368 // 0 < residual_width <= 2, so residual_width == 1 exactly when residual_width < 2.
6369 const int adjusted_width_micro_repeats =
6370 (output_width_micro_repeats <
6371 output_width_overall_micro_repeats) &&
6372 (residual_width == 1)
6373 ? output_width_micro_repeats
6374 : output_width_overall_micro_repeats;
6375
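// Main width loop: each iteration consumes one 4-pixel input micro block
// and produces two stride-2 output columns for both output rows.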
6376 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6377 const int output_width = kFourOverStride;
6378 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6379 const int8* input_data =
6380 input_data_0 + width_micro_stride * i_width;
6381 acc0 = adjusted_bias_data;
6382 acc1 = adjusted_bias_data;
6383 right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
6384 right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
6385 workspace_height_stride);
6386
6387 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6388 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6389 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6390 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6391
6392 right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
6393 2 * workspace_height_stride);
6394 right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
6395 3 * workspace_height_stride);
6396 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6397 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6398 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6399 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6400 right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
6401 4 * workspace_height_stride);
6402
6403 // Fixed-point multiplication.
6404 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6405 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6406 acc0, output_shift);
6407 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6408 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6409 acc1, output_shift);
6410 // Add the output offset.
6411 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6412 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6413 // Apply the activation function.
6414 acc_u8 = vqmovxn_s16(acc_s16_0_1);
6415 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6416 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6417
6418 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
6419 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
6420 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
6421 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
6422 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
6423 acc0 = adjusted_bias_data;
6424 acc1 = adjusted_bias_data;
6425 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6426 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6427 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6428 vst1_lane_8x4(output_data_base, acc_u8, 0);
6429 vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
6430
6431 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6432 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6433
6434 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6435 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6436 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6437 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6438 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6439 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6440
6441 // Fixed-point multiplication.
6442 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6443 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6444 acc0, output_shift);
6445 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6446 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6447 acc1, output_shift);
6448 // Add the output offset.
6449 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6450 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6451 // Apply the activation function.
6452 acc_u8 = vqmovxn_s16(acc_s16_0_1);
6453 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6454 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6455
6456 vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
6457 vst1_lane_8x4(output_data_base + depth + output_height_stride,
6458 acc_u8, 1);
6459
6460 left_bank_0_reg = right_bank_0_reg;
6461 left_bank_1_reg = right_bank_1_reg;
6462 left_bank_2_reg = right_bank_2_reg;
6463 left_bank_3_reg = right_bank_3_reg;
6464 left_bank_4_reg = right_bank_4_reg;
6465 }
6466 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
6467 TFLITE_DCHECK_NE(residual_width, kFourOverStride);
6468
6469 // No need to load next ("right") block of data.
6470
6471 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6472 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6473
6474 // Iterate over input width shifts within 4x4 blocks.
6475 {
6476 acc0 = adjusted_bias_data;
6477 acc1 = adjusted_bias_data;
6478
6479 acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6480 acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6481 acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6482 acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6483 acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6484 acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6485
6486 // Fixed-point multiplication.
6487 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6488 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6489 acc0, output_shift);
6490 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6491 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6492 acc1, output_shift);
6493 // Add the output offset.
6494 int16x8_t acc_s16_0_1 =
6495 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6496 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6497 // Apply the activation function.
6498 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6499 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6500 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6501
6502 vst1_lane_8x4(output_data_base, acc_u8, 0);
6503 vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
6504
6505 left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
6506 left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
6507 left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
6508 left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
6509 left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
6510 vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6511 vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6512 vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6513 vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6514 vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6515 }
6516 }
6517 bias_data += kBiasIncrement;
6518 }
6519 } else {
6520 // block_height == 1.
6521 int8x16_t filter_reg_0_a;
6522 int8x16_t filter_reg_1_a;
6523 int8x16_t filter_reg_2_a;
6524 int8x16_t filter_reg_0_b;
6525 int8x16_t filter_reg_1_b;
6526 int8x16_t filter_reg_2_b;
6527
6528 filter_reg_0_a = vld1q_s8(filter_block);
6529 filter_reg_1_a = vld1q_s8(filter_block + 32);
6530 filter_reg_2_a = vld1q_s8(filter_block + 64);
6531 filter_reg_0_b = vld1q_s8(filter_block + 16);
6532 filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
6533 filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
6534
6535 const int8* scratch_data =
6536 scratch_block_data + depth_micro_stride * j_depth;
6537 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6538 output_data = output_block_data + 8 * j_depth;
6539 const int8* input_data_0 = scratch_data;
6540
6541 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6542 bias_data += kBiasIncrement;
6543 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6544 bias_data += kBiasIncrement;
6545
6546 const int32x4_t output_shift_a =
6547 vld1q_s32(output_shift_per_channel + j_depth * 8);
6548 const int32x4_t output_multiplier_a =
6549 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6550 const int32x4_t output_shift_b =
6551 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6552 const int32x4_t output_multiplier_b =
6553 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6554
6555 // Load first sub-micro block of data into operational banks.
6556 int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
6557 int8x16_t left_bank_1_reg_a =
6558 vld1q_s8(input_data_0 + workspace_height_stride);
6559 int8x16_t left_bank_2_reg_a =
6560 vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6561 int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
6562 int8x16_t left_bank_1_reg_b =
6563 vld1q_s8(input_data_0 + workspace_height_stride + 16);
6564 int8x16_t left_bank_2_reg_b =
6565 vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
6566
6567 int8x16_t right_bank_0_reg_a;
6568 int8x16_t right_bank_1_reg_a;
6569 int8x16_t right_bank_2_reg_a;
6570 int8x16_t right_bank_0_reg_b;
6571 int8x16_t right_bank_1_reg_b;
6572 int8x16_t right_bank_2_reg_b;
6573
6574 int32x4_t acc0_a;
6575 int32x4_t acc0_b;
6576
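// Single-row path: each width iteration produces up to two stride-2 output
// columns, each covering all 8 channels of the micro block.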
6577 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6578 ++i_width) {
6579 const int output_width = i_width == output_width_micro_repeats
6580 ? residual_width
6581 : kFourOverStride;
6582 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6583 const int8* input_data = input_data_0 + width_micro_stride * i_width;
6584 const bool no_right_block = i_width == output_width_micro_repeats &&
6585 output_width_overall_micro_repeats ==
6586 workspace_width_micro_repeats;
6587
6588 if (!no_right_block) {
6589 // Load next sub-micro block of data.
6590 right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
6591 right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
6592 workspace_height_stride);
6593 right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
6594 2 * workspace_height_stride);
6595 right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
6596 right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
6597 workspace_height_stride + 16);
6598 right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
6599 2 * workspace_height_stride + 16);
6600 }
6601
6602 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6603 output_data_base = output_data + depth * 2 * i_width;
6604
6605 // Iterate over input width shifts within 4x4 blocks.
6606 {
6607 acc0_a = adjusted_bias_data_a;
6608 acc0_b = adjusted_bias_data_b;
6609
6610 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6611 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6612 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6613 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6614 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6615 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6616
6617 // Fixed-point multiplication.
6618 acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6619 acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6620 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6621 acc0_a, output_shift_a);
6622 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6623 acc0_b, output_shift_b);
6624 // Add the output offset.
6625 int16x8_t acc_s16_0_1 =
6626 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6627 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6628 // Apply the activation function.
6629 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6630 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6631 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6632
6633 vst1_s8(output_data_base, acc_u8);
6634
6635 left_bank_0_reg_a = vrev32q_u16(left_bank_0_reg_a);
6636 left_bank_1_reg_a = vrev32q_u16(left_bank_1_reg_a);
6637 left_bank_2_reg_a = vrev32q_u16(left_bank_2_reg_a);
6638 left_bank_0_reg_b = vrev32q_u16(left_bank_0_reg_b);
6639 left_bank_1_reg_b = vrev32q_u16(left_bank_1_reg_b);
6640 left_bank_2_reg_b = vrev32q_u16(left_bank_2_reg_b);
6641 vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
6642 vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
6643 vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
6644 vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
6645 vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
6646 vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
6647 }
6648
6649 if (output_width > 1) {
6650 acc0_a = adjusted_bias_data_a;
6651 acc0_b = adjusted_bias_data_b;
6652
6653 acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6654 acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6655 acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6656 acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6657 acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6658 acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6659
6660 // Fixed-point multiplication.
6661 acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6662 acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6663 acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6664 acc0_a, output_shift_a);
6665 acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6666 acc0_b, output_shift_b);
6667 // Add the output offset.
6668 int16x8_t acc_s16_0_1 =
6669 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6670 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6671 // Apply the activation function.
6672 int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6673 acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6674 acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6675
6676 vst1_s8(output_data_base + depth, acc_u8);
6677
6678 left_bank_0_reg_a = right_bank_0_reg_a;
6679 left_bank_1_reg_a = right_bank_1_reg_a;
6680 left_bank_2_reg_a = right_bank_2_reg_a;
6681 left_bank_0_reg_b = right_bank_0_reg_b;
6682 left_bank_1_reg_b = right_bank_1_reg_b;
6683 left_bank_2_reg_b = right_bank_2_reg_b;
6684 }
6685 }
6686 }
6687 }
6688 } // NOLINT(readability/fn_size) Manually unrolled.
6689
6690 static inline void Run(const int8* scratch_block_data,
6691 const int8* filter_workspace, const int32* bias_data,
6692 int8* output_block_data,
6693 const DepthwiseConvDotProdParams* function_params) {
6694 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6695 output_block_data, function_params);
6696 }
6697 };
6698
6699 template <>
6700 struct KernelMacroBlock<
6701 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6702 QuantizationType::kPerChannelInt8,
6703 DepthwiseConvDepthMultiplication::kUnitInputDepth,
6704 /*stride=*/1> {
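// Per-channel int8 quantization, unit input depth (depth-multiplier path),
// stride 1. With a single input channel, four consecutive input pixels fit
// in one 32-bit lane, so the dot products use broadcast lanes
// (vdotq_four_lane_s32) over packed "input bank" registers.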
6705 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6706 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6707 return vmin_s8(a, b);
6708 }
6709 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6710 return vmax_s8(a, b);
6711 }
6712 static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
6713 return vminq_s8(a, b);
6714 }
6715 static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
6716 return vmaxq_s8(a, b);
6717 }
6718
6719 static inline void KernelMacroBlockIntrinsics(
6720 const int8* scratch_block_data, const int8* filter_workspace,
6721 const int32* bias_data, int8* output_block_data,
6722 const DepthwiseConvDotProdParams* function_params) {
6723 static constexpr QuantizationType quantization_type =
6724 QuantizationType::kPerChannelInt8;
6725
6726 TFLITE_DCHECK_EQ(function_params->stride, 1);
6727 const int workspace_height_stride =
6728 function_params->workspace_height_stride;
6729 const int output_width_micro_repeats =
6730 function_params->output_width_micro_repeats;
6731 const int depth_micro_repeats = function_params->depth_micro_repeats;
6732 const int output_depth = function_params->output_depth;
6733
6734 const int output_width_overall_micro_repeats =
6735 function_params->output_width_overall_micro_repeats;
6736 const int block_height = function_params->outbound_block_height;
6737 const int residual_width = function_params->output_residual_width;
6738 const int output_height_stride = function_params->output_height_stride;
6739 constexpr int kBiasIncrement = 4;
6740
6741 TFLITE_DCHECK(depth_micro_repeats > 0);
6742
6743 const int32 output_activation_min =
6744 function_params->quantized_activation_min;
6745 const int32 output_activation_max =
6746 function_params->quantized_activation_max;
6747 const int32 output_offset = function_params->output_offset;
6748 const int32* output_shift_per_channel =
6749 function_params->output_shift_per_channel;
6750 const int32* output_multiplier_per_channel =
6751 function_params->output_multiplier_per_channel;
6752 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6753 TFLITE_DCHECK_GE(output_activation_min, 0);
6754 TFLITE_DCHECK_LT(output_activation_min, 256);
6755 TFLITE_DCHECK_GE(output_activation_max, 0);
6756 TFLITE_DCHECK_LT(output_activation_max, 256);
6757 } else {
6758 TFLITE_DCHECK_GE(output_activation_min, -128);
6759 TFLITE_DCHECK_LT(output_activation_min, 128);
6760 TFLITE_DCHECK_GE(output_activation_max, -128);
6761 TFLITE_DCHECK_LT(output_activation_max, 128);
6762 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6763 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6764 }
6765 TFLITE_DCHECK_GE(output_offset, -32768);
6766 TFLITE_DCHECK_LT(output_offset, 32768);
6767
6768 const int16x8_t output_offset_vec =
6769 vdupq_n_s16(static_cast<int16>(output_offset));
6770 const int8x16_t output_activation_min_vec =
6771 vdupq_n_s8(static_cast<int8>(output_activation_min));
6772 const int8x16_t output_activation_max_vec =
6773 vdupq_n_s8(static_cast<int8>(output_activation_max));
6774
6775 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6776 output_data_depthwise = output_block_data;
6777 for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6778 // Simulate NEON-register transposition of subset of filter.
6779 int8x16_t filter_reg_0_a;
6780 int8x16_t filter_reg_0_b;
6781 int8x16_t filter_reg_1_a;
6782 int8x16_t filter_reg_1_b;
6783 int8x16_t filter_reg_2_a;
6784 int8x16_t filter_reg_2_b;
6785 int8x16_t filter_reg_0_a_shifted;
6786 int8x16_t filter_reg_1_a_shifted;
6787 int8x16_t filter_reg_2_a_shifted;
6788
6789 filter_reg_0_a = vld1q_s8(filter_workspace);
6790 filter_workspace += 16;
6791 filter_reg_0_b = vld1q_s8(filter_workspace);
6792 filter_workspace += 16;
6793 filter_reg_1_a = vld1q_s8(filter_workspace);
6794 filter_workspace += 16;
6795 filter_reg_1_b = vld1q_s8(filter_workspace);
6796 filter_workspace += 16;
6797 filter_reg_2_a = vld1q_s8(filter_workspace);
6798 filter_workspace += 16;
6799 filter_reg_2_b = vld1q_s8(filter_workspace);
6800 filter_workspace += 16;
6801
6802 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
6803 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
6804 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
6805
6806 // When output_width_micro_repeats < output_width_overall_micro_repeats,
6807 // a trailing micro block narrower than 4 columns is excluded from the main
6808 // width loop here and handled by the residual code that follows it.
6809 const int adjusted_width_micro_repeats =
6810 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
6811 (residual_width < 4)
6812 ? output_width_micro_repeats
6813 : output_width_overall_micro_repeats;
6814
6815 if (block_height == 4) {
6816 for (int s = 0; s < 2; ++s) {
6817 // Work through one slice, by row, at a time.
6818 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6819 output_data_base = output_data_depthwise + 4 * s;
6820
6821 const int8* next_input_data = scratch_block_data;
6822 typename QuantizationTypeImpl<quantization_type>::ExternalType*
6823 output_data = output_data_base;
6824
6825 const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6826 bias_data += kBiasIncrement;
6827
6828 const int32x4_t output_shift =
6829 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6830 const int32x4_t output_multiplier =
6831 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6832
6833 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
6834 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
6835 int8x16_t input_bank_c_reg; // left 4, right 4, left 5, right 5.
6836
6837 // Load first sub-micro block of data into operational banks.
6838 input_bank_a_reg =
6839 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
6840 // uninitialized variable.
6841 input_bank_a_reg = vld1q_lane_8x4(
6842 next_input_data + workspace_height_stride, input_bank_a_reg, 2);
6843 input_bank_b_reg = vld1q_dup_s8x4(
6844 next_input_data +
6845 2 * workspace_height_stride); // Load lane 0, avoiding
6846 // uninitialized variable.
6847 input_bank_b_reg =
6848 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6849 input_bank_b_reg, 2);
6850 input_bank_c_reg = vld1q_dup_s8x4(
6851 next_input_data +
6852 4 * workspace_height_stride); // Load lane 0, avoiding
6853 // uninitialized variable.
6854 input_bank_c_reg =
6855 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6856 input_bank_c_reg, 2);
6857
6858 int32x4_t acc0;
6859 int32x4_t acc1;
6860 int32x4_t acc2;
6861 int32x4_t acc3;
6862
6863 acc0 = adjusted_bias_data;
6864 acc1 = adjusted_bias_data;
6865 acc2 = adjusted_bias_data;
6866 acc3 = adjusted_bias_data;
6867
6868 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
6869 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
6870 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
6871 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
6872
6873 int i_width = 0;
6874 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6875 next_input_data += 4;
6876
6877 // Iterate over input width shifts within 4x4 blocks.
6878 {
6879 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
6880 0);
6881 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
6882 2);
6883 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
6884 2);
6885 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
6886 2);
6887 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
6888 2);
6889 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
6890 0);
6891 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
6892 0);
6893 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
6894 2);
6895
6896 // Fixed-point multiplication.
6897 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6898 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6899 acc0, output_shift);
6900 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6901 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6902 acc1, output_shift);
6903 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6904 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6905 acc2, output_shift);
6906 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6907 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6908 acc3, output_shift);
6909 // Add the output offset.
6910 int16x8_t acc_s16_0_1 =
6911 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6912 int16x8_t acc_s16_2_3 =
6913 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6914 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6915 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6916 // Apply the activation function.
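// vqmovxn_s16 and the util_vmaxq/vminq_x8 helpers are signedness-generic: each
// specialization maps them to the signed or unsigned 8-bit intrinsics, so the
// same body serves both uint8 and per-channel int8 quantization.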
6917 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
6918 vqmovxn_s16(acc_s16_2_3));
6919 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6920 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6921
6922 vst1q_lane_8x4(output_data, acc_u8_all, 0);
6923 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
6924 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
6925 2);
6926 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
6927 3);
6928
6929 output_data += output_depth;
6930 }
6931 // Load next sub-micro block of data.
6932 input_bank_a_reg =
6933 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
6934 input_bank_a_reg = vld1q_lane_8x4(
6935 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
6936 input_bank_b_reg =
6937 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
6938 input_bank_b_reg, 1);
6939 input_bank_b_reg =
6940 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6941 input_bank_b_reg, 3);
6942 input_bank_c_reg =
6943 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
6944 input_bank_c_reg, 1);
6945 input_bank_c_reg =
6946 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6947 input_bank_c_reg, 3);
6948
6949 {
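// Second output column of this pair: filter_reg_*_a_shifted hold the filter
// bytes pre-shifted one byte to the left (see the vshlq_n_u32 calls at the end
// of the depth loop), so the same, unshifted input banks yield the dot
// products for the column one pixel to the right.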
6950 acc0 = adjusted_bias_data;
6951 acc1 = adjusted_bias_data;
6952 acc2 = adjusted_bias_data;
6953 acc3 = adjusted_bias_data;
6954
6955 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
6956 input_bank_a_reg, 0);
6957 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
6958 input_bank_a_reg, 2);
6959 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
6960 input_bank_b_reg, 0);
6961 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
6962 input_bank_a_reg, 2);
6963 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
6964 input_bank_b_reg, 0);
6965 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
6966 input_bank_b_reg, 2);
6967 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
6968 input_bank_b_reg, 0);
6969 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
6970 input_bank_b_reg, 2);
6971 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
6972 input_bank_c_reg, 0);
6973 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
6974 input_bank_b_reg, 2);
6975 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
6976 input_bank_c_reg, 0);
6977 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
6978 input_bank_c_reg, 2);
6979
6980 // Fixed-point multiplication.
6981 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6982 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6983 acc0, output_shift);
6984 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6985 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6986 acc1, output_shift);
6987 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6988 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6989 acc2, output_shift);
6990 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6991 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6992 acc3, output_shift);
6993 // Add the output offset.
6994 int16x8_t acc_s16_0_1 =
6995 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6996 int16x8_t acc_s16_2_3 =
6997 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6998 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6999 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7000 // Apply the activation function.
7001 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7002 vqmovxn_s16(acc_s16_2_3));
7003 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7004 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7005
7006 vst1q_lane_8x4(output_data, acc_u8_all, 0);
7007 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7008 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7009 2);
7010 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7011 3);
7012
7013 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7014 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7015 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
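// Shifting each 64-bit half right by 16 bits drops the two input columns just
// consumed, re-aligning lanes 0 and 2 for the next pair of output columns.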
7016
7017 output_data += output_depth;
7018 }
7019
7020 {
7021 acc0 = adjusted_bias_data;
7022 acc1 = adjusted_bias_data;
7023 acc2 = adjusted_bias_data;
7024 acc3 = adjusted_bias_data;
7025
7026 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7027 0);
7028 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7029 2);
7030 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7031 0);
7032 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7033 2);
7034 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7035 0);
7036 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7037 2);
7038 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7039 0);
7040 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7041 2);
7042 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7043 0);
7044 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7045 2);
7046 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7047 0);
7048 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7049 2);
7050
7051 // Fixed-point multiplication.
7052 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7053 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7054 acc0, output_shift);
7055 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7056 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7057 acc1, output_shift);
7058 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7059 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7060 acc2, output_shift);
7061 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7062 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7063 acc3, output_shift);
7064 // Add the output offset.
7065 int16x8_t acc_s16_0_1 =
7066 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7067 int16x8_t acc_s16_2_3 =
7068 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7069 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7070 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7071 // Apply the activation function.
7072 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7073 vqmovxn_s16(acc_s16_2_3));
7074 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7075 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7076
7077 vst1q_lane_8x4(output_data, acc_u8_all, 0);
7078 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7079 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7080 2);
7081 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7082 3);
7083
7084 output_data += output_depth;
7085 }
7086
7087 {
7088 acc0 = adjusted_bias_data;
7089 acc1 = adjusted_bias_data;
7090 acc2 = adjusted_bias_data;
7091 acc3 = adjusted_bias_data;
7092
7093 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7094 input_bank_a_reg, 0);
7095 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7096 input_bank_a_reg, 2);
7097 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7098 input_bank_b_reg, 0);
7099 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7100 input_bank_a_reg, 2);
7101 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7102 input_bank_b_reg, 0);
7103 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7104 input_bank_b_reg, 2);
7105 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7106 input_bank_b_reg, 0);
7107 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7108 input_bank_b_reg, 2);
7109 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7110 input_bank_c_reg, 0);
7111 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7112 input_bank_b_reg, 2);
7113 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7114 input_bank_c_reg, 0);
7115 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7116 input_bank_c_reg, 2);
7117
7118 // Fixed-point multiplication.
7119 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7120 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7121 acc0, output_shift);
7122 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7123 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7124 acc1, output_shift);
7125 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7126 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7127 acc2, output_shift);
7128 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7129 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7130 acc3, output_shift);
7131 // Add the output offset.
7132 int16x8_t acc_s16_0_1 =
7133 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7134 int16x8_t acc_s16_2_3 =
7135 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7136 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7137 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7138 // Apply the activation function.
7139 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7140 vqmovxn_s16(acc_s16_2_3));
7141 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7142 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7143
7144 vst1q_lane_8x4(output_data, acc_u8_all, 0);
7145 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7146 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7147 2);
7148 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7149 3);
7150
7151 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7152 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7153 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7154
7155 output_data += output_depth;
7156 acc0 = adjusted_bias_data;
7157 acc1 = adjusted_bias_data;
7158 acc2 = adjusted_bias_data;
7159 acc3 = adjusted_bias_data;
7160
7161 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7162 0);
7163 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7164 0);
7165 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7166 0);
7167 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7168 2);
7169 }
7170 }
7171
7172 if (i_width < output_width_overall_micro_repeats) {
7173 next_input_data += 4;
7174 const int output_width = residual_width;
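// Residual columns are produced one at a time: below, the banks advance by a
// single byte (one input column) per output instead of the two-column steps
// used in the main loop.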
7175
7176 // Load next sub-micro block of data.
7177 input_bank_a_reg =
7178 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7179 input_bank_a_reg = vld1q_lane_8x4(
7180 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7181 input_bank_b_reg =
7182 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7183 input_bank_b_reg, 1);
7184 input_bank_b_reg =
7185 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7186 input_bank_b_reg, 3);
7187 input_bank_c_reg =
7188 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7189 input_bank_c_reg, 1);
7190 input_bank_c_reg =
7191 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7192 input_bank_c_reg, 3);
7193
7194 // Iterate over input width shifts within 4x4 blocks.
7195 for (int x = 0; x < output_width; ++x) {
7196 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7197 0);
7198 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7199 2);
7200 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7201 2);
7202 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7203 2);
7204 acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7205 2);
7206 acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7207 0);
7208 acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7209 0);
7210 acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7211 2);
7212
7213 // Fixed-point multiplication.
7214 acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7215 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7216 acc0, output_shift);
7217 acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7218 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7219 acc1, output_shift);
7220 acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7221 acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7222 acc2, output_shift);
7223 acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7224 acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7225 acc3, output_shift);
7226 // Add the output offset.
7227 int16x8_t acc_s16_0_1 =
7228 vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7229 int16x8_t acc_s16_2_3 =
7230 vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7231 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7232 acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7233 // Apply the activation function.
7234 int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7235 vqmovxn_s16(acc_s16_2_3));
7236 acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7237 acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7238
7239 vst1q_lane_8x4(output_data, acc_u8_all, 0);
7240 vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7241 vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7242 2);
7243 vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7244 3);
7245
7246 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
7247 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
7248 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
7249
7250 output_data += output_depth;
7251
7252 acc0 = adjusted_bias_data;
7253 acc1 = adjusted_bias_data;
7254 acc2 = adjusted_bias_data;
7255 acc3 = adjusted_bias_data;
7256
7257 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7258 0);
7259 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7260 0);
7261 acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7262 0);
7263 acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7264 2);
7265 }
7266 }
7267 // scratch_block_data += 4 * workspace_height_stride;
7268 output_data_base += 4 * output_height_stride;
7269
7270 // Move to next sub-block: advance to second set of filters, to new
7271 // bias.
7272 filter_reg_0_a = filter_reg_0_b;
7273 filter_reg_1_a = filter_reg_1_b;
7274 filter_reg_2_a = filter_reg_2_b;
7275 filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
7276 filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
7277 filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
7278 }
7279 } else {
7280 // Block height < 4.
7281 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7282 output_data_base = output_data_depthwise;
7283
7284 const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
7285 bias_data += kBiasIncrement;
7286 const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
7287 bias_data += kBiasIncrement;
7288
7289 const int32x4_t output_shift_a =
7290 vld1q_s32(output_shift_per_channel + j_depth * 8);
7291 const int32x4_t output_multiplier_a =
7292 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7293 const int32x4_t output_shift_b =
7294 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7295 const int32x4_t output_multiplier_b =
7296 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7297
7298 for (int k_height = 0; k_height < block_height; ++k_height) {
7299 const int8* next_input_data =
7300 scratch_block_data + k_height * workspace_height_stride;
7301 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7302 output_data = output_data_base;
7303
7304 int8x16_t input_bank_p_reg; // left 0, right 0, left 1, right 1.
7305 int8x16_t input_bank_q_reg; // left 2, right 2, left 3, right 3.
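// With fewer than four output rows, rows are produced one at a time: bank p
// holds input rows k and k+1, and the low half of bank q holds row k+2, which
// is all a single 3x3 output row requires.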
7306
7307 // Load first sub-micro block of data into operational banks.
7308 input_bank_p_reg =
7309 vld1q_dup_s8x4(next_input_data); // Load lane 0, avoiding
7310 // uninitialized variable.
7311 input_bank_p_reg = vld1q_lane_8x4(
7312 next_input_data + workspace_height_stride, input_bank_p_reg, 2);
7313 input_bank_q_reg = vld1q_dup_s8x4(
7314 next_input_data +
7315 2 * workspace_height_stride); // Load lane 0, avoiding
7316 // uninitialized variable.
7317
7318 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7319 ++i_width) {
7320 next_input_data += 4;
7321 const int output_width =
7322 i_width == output_width_micro_repeats ? residual_width : 4;
7323
7324 // Load next sub-micro block of data.
7325 input_bank_p_reg =
7326 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
7327 input_bank_p_reg = vld1q_lane_8x4(
7328 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
7329 input_bank_q_reg =
7330 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7331 input_bank_q_reg, 1);
7332 // Iterate over input width shifts within 4x4 blocks.
7333 for (int x = 0; x < output_width; ++x) {
7334 int32x4_t acc_a = adjusted_bias_data_a;
7335 int32x4_t acc_b = adjusted_bias_data_b;
7336 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
7337 input_bank_p_reg, 0);
7338 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
7339 input_bank_p_reg, 2);
7340 acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
7341 input_bank_q_reg, 0);
7342 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
7343 input_bank_p_reg, 0);
7344 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
7345 input_bank_p_reg, 2);
7346 acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
7347 input_bank_q_reg, 0);
7348
7349 // Fixed-point multiplication.
7350 acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
7351 acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
7352 acc_a =
7353 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7354 acc_a, output_shift_a);
7355 acc_b =
7356 DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7357 acc_b, output_shift_b);
7358 // Add the output offset.
7359 int16x8_t acc_s16_0_0 =
7360 vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
7361 acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
7362 // Apply the activation function.
7363 int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
7364 acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
7365 vget_low_s8(output_activation_min_vec));
7366 acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
7367 vget_low_s8(output_activation_max_vec));
7368
7369 vst1_s8(output_data, acc_u8_0_0);
7370
7371 input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
7372 input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);
7373
7374 output_data += output_depth;
7375 }
7376 }
7377 output_data_base += output_height_stride;
7378 }
7379 }
7380 output_data_depthwise += 8;
7381 }
7382 } // NOLINT(readability/fn_size) Manually unrolled.
7383
7384 static inline void Run(const int8* scratch_block_data,
7385 const int8* filter_workspace, const int32* bias_data,
7386 int8* output_block_data,
7387 const DepthwiseConvDotProdParams* function_params) {
7388 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
7389 output_block_data, function_params);
7390 }
7391 };
7392
7393 template <>
7394 struct KernelMacroBlock<
7395 DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
7396 QuantizationType::kPerChannelInt8,
7397 DepthwiseConvDepthMultiplication::kUnitInputDepth,
7398 /*stride=*/2> {
7399 static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
7400 static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
7401 return vmin_s8(a, b);
7402 }
7403 static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
7404 return vmax_s8(a, b);
7405 }
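// These wrappers pin the signedness-generic 8-bit helpers to signed int8
// operations for this per-channel int8 specialization.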
7406
7407 static inline void KernelMacroBlockIntrinsics(
7408 const int8* scratch_block_data, const int8* filter_workspace,
7409 const int32* bias_data, int8* output_block_data,
7410 const DepthwiseConvDotProdParams* function_params) {
7411 static constexpr QuantizationType quantization_type =
7412 QuantizationType::kPerChannelInt8;
7413
7414 const int workspace_height_stride =
7415 function_params->workspace_height_stride;
7416 const int output_width_micro_repeats =
7417 function_params->output_width_micro_repeats;
7418 const int depth_micro_repeats = function_params->depth_micro_repeats;
7419 const int output_depth = function_params->output_depth;
7420 constexpr int kStrideVal = 2;
7421 TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
7422
7423 const int output_width_overall_micro_repeats =
7424 function_params->output_width_overall_micro_repeats;
7425 const int block_height = function_params->outbound_block_height;
7426 const int residual_width = function_params->output_residual_width;
7427 const int output_height_stride = function_params->output_height_stride;
7428 constexpr int kBiasIncrement = 4;
7429
7430 const int32 output_activation_min =
7431 function_params->quantized_activation_min;
7432 const int32 output_activation_max =
7433 function_params->quantized_activation_max;
7434 const int32 output_offset = function_params->output_offset;
7435 const int32* output_shift_per_channel =
7436 function_params->output_shift_per_channel;
7437 const int32* output_multiplier_per_channel =
7438 function_params->output_multiplier_per_channel;
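// quantization_type is a compile-time constant here, so only the int8 branch
// of the check below is live; the uint8 range checks are presumably retained
// to keep this body in step with the other specializations.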
7439 if (quantization_type == QuantizationType::kNonPerChannelUint8) {
7440 TFLITE_DCHECK_GE(output_activation_min, 0);
7441 TFLITE_DCHECK_LT(output_activation_min, 256);
7442 TFLITE_DCHECK_GE(output_activation_max, 0);
7443 TFLITE_DCHECK_LT(output_activation_max, 256);
7444 } else {
7445 TFLITE_DCHECK_GE(output_activation_min, -128);
7446 TFLITE_DCHECK_LT(output_activation_min, 128);
7447 TFLITE_DCHECK_GE(output_activation_max, -128);
7448 TFLITE_DCHECK_LT(output_activation_max, 128);
7449 TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
7450 TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
7451 }
7452 TFLITE_DCHECK_GE(output_offset, -32768);
7453 TFLITE_DCHECK_LT(output_offset, 32768);
7454
7455 TFLITE_DCHECK_GE(depth_micro_repeats, 1);
7456
7457 const int16x8_t output_offset_vec =
7458 vdupq_n_s16(static_cast<int16>(output_offset));
7459 const int8x16_t output_activation_min_vec =
7460 vdupq_n_s8(static_cast<int8>(output_activation_min));
7461 const int8x16_t output_activation_max_vec =
7462 vdupq_n_s8(static_cast<int8>(output_activation_max));
7463
7464 for (int j_depth = 0; j_depth < (depth_micro_repeats * 1 + 0); ++j_depth) {
7465 int8x16_t filter_reg_0_a;
7466 int8x16_t filter_reg_0_b;
7467 int8x16_t filter_reg_1_a;
7468 int8x16_t filter_reg_1_b;
7469 int8x16_t filter_reg_2_a;
7470 int8x16_t filter_reg_2_b;
7471
7472 filter_reg_0_a = vld1q_s8(filter_workspace);
7473 filter_workspace += 16;
7474 filter_reg_0_b = vld1q_s8(filter_workspace);
7475 filter_workspace += 16;
7476 filter_reg_1_a = vld1q_s8(filter_workspace);
7477 filter_workspace += 16;
7478 filter_reg_1_b = vld1q_s8(filter_workspace);
7479 filter_workspace += 16;
7480 filter_reg_2_a = vld1q_s8(filter_workspace);
7481 filter_workspace += 16;
7482 filter_reg_2_b = vld1q_s8(filter_workspace);
7483 filter_workspace += 16;
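// The six 16-byte loads above fetch one depth micro block of filters from the
// rearranged workspace: three filter rows, each split into two 4-channel
// sub-blocks (suffixes _a and _b). A micro block therefore spans 8 output
// channels.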
7484
7485 const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
7486 bias_data += kBiasIncrement;
7487 const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
7488 bias_data += kBiasIncrement;
7489
7490 const int32x4_t output_shift_s_0 =
7491 vld1q_s32(output_shift_per_channel + j_depth * 8);
7492 const int32x4_t output_multiplier_s_0 =
7493 vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7494 const int32x4_t output_shift_s_1 =
7495 vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7496 const int32x4_t output_multiplier_s_1 =
7497 vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
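// Bias and per-channel requantization parameters are likewise loaded as two
// 4-lane vectors covering those 8 channels.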
7498
7499 if (block_height == 2) {
7500 const int8* scratch_data = scratch_block_data;
7501 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7502 output_data = output_block_data + 8 * j_depth;
7503
7504 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
7505 int8x16_t input_bank_b_reg; // left 2, right 2, left 3, right 3.
7506 int8x16_t input_bank_c_reg; // left 4, right 4, xxx, xxx.
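// For stride 2 with two output rows, the banks span the five input rows
// required: rows 0-2 feed the upper output row and rows 2-4 the lower.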
7507
7508 // Load first sub-micro block of data into operational banks.
7509 input_bank_a_reg =
7510 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
7511 // uninitialized variable.
7512 input_bank_a_reg = vld1q_lane_8x4(
7513 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7514 input_bank_b_reg = vld1q_dup_s8x4(
7515 scratch_data +
7516 2 * workspace_height_stride); // Load lane 0, avoiding
7517 // uninitialized variable.
7518 input_bank_b_reg = vld1q_lane_8x4(
7519 scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
7520 input_bank_c_reg = vld1q_dup_s8x4(
7521 scratch_data +
7522 4 * workspace_height_stride); // Load lane 0, avoiding
7523 // uninitialized variable.
7524
7525 int32x4_t acc0;
7526 int32x4_t acc1;
7527
7528 // When output_width_micro_repeats < output_width_overall_micro_repeats, the
7529 // final micro block is partial with 0 < residual_width <= 2; only a width-1
7530 // residual needs the tail handling below, and in that range residual_width == 1 iff residual_width < 2.
7531 const int adjusted_width_micro_repeats =
7532 (output_width_micro_repeats < output_width_overall_micro_repeats) &&
7533 (residual_width < 2)
7534 ? output_width_micro_repeats
7535 : output_width_overall_micro_repeats;
7536
7537 int i_width = 0;
7538 for (; i_width < adjusted_width_micro_repeats; ++i_width) {
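// Each full iteration consumes four new input columns and, at stride 2, emits
// two output columns per row: the first in the braced block below, the second
// in the trailing code of the loop body.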
7539 const int8* input_data = scratch_data + 4 + 4 * i_width;
7540
7541 // Load next sub-micro block of data.
7542 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7543 input_bank_a_reg = vld1q_lane_8x4(
7544 input_data + workspace_height_stride, input_bank_a_reg, 3);
7545 input_bank_b_reg = vld1q_lane_8x4(
7546 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7547 input_bank_b_reg = vld1q_lane_8x4(
7548 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
7549 input_bank_c_reg = vld1q_lane_8x4(
7550 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
7551
7552 int16x8_t acc_s16_0_1;
7553 int8x8_t acc_u8_0_1;
7554 // Iterate over input width shifts within 4x4 blocks.
7555 {
7556 acc0 = adjusted_bias_data_s_0;
7557 acc1 = adjusted_bias_data_s_0;
7558
7559 acc0 =
7560 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7561 acc0 =
7562 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7563 acc0 =
7564 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7565 acc1 =
7566 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7567 acc1 =
7568 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7569 acc1 =
7570 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7571
7572 // Fixed-point multiplication.
7573 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7574 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7575 acc0, output_shift_s_0);
7576 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7577 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7578 acc1, output_shift_s_0);
7579 // Add the output offset.
7580 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7581 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7582 // Apply the activation function.
7583 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7584 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7585 vget_low_s8(output_activation_min_vec));
7586 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7587 vget_low_s8(output_activation_max_vec));
7588
7589 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
7590 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7591
7592 acc0 = adjusted_bias_data_s_1;
7593 acc1 = adjusted_bias_data_s_1;
7594
7595 acc0 =
7596 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7597 acc0 =
7598 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7599 acc0 =
7600 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7601 acc1 =
7602 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7603 acc1 =
7604 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7605 acc1 =
7606 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7607
7608 // Fixed-point multiplication.
7609 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7610 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7611 acc0, output_shift_s_1);
7612 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7613 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7614 acc1, output_shift_s_1);
7615 // Add the output offset.
7616 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7617 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7618 // Apply the activation function.
7619 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7620 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7621 vget_low_s8(output_activation_min_vec));
7622 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7623 vget_low_s8(output_activation_max_vec));
7624
7625 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
7626 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
7627 1);
7628
7629 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7630 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7631 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7632
7633 output_data += output_depth;
7634 }
7635
7636 // Full micro block: output_width == four_over_stride, so emit the second output column here.
7637 acc0 = adjusted_bias_data_s_0;
7638 acc1 = adjusted_bias_data_s_0;
7639
7640 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7641 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7642 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7643 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7644 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7645 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7646
7647 // Fixed-point multiplication.
7648 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7649 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7650 acc0, output_shift_s_0);
7651 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7652 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7653 acc1, output_shift_s_0);
7654 // Add the output offset.
7655 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7656 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7657 // Apply the activation function.
7658 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7659 acc_u8_0_1 =
7660 util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7661 acc_u8_0_1 =
7662 util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7663
7664 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
7665 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7666
7667 acc0 = adjusted_bias_data_s_1;
7668 acc1 = adjusted_bias_data_s_1;
7669
7670 acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7671 acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7672 acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7673 acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7674 acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7675 acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7676
7677 // Fixed-point multiplication.
7678 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7679 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7680 acc0, output_shift_s_1);
7681 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7682 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7683 acc1, output_shift_s_1);
7684 // Add the output offset.
7685 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7686 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7687 // Apply the activation function.
7688 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7689 acc_u8_0_1 =
7690 util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7691 acc_u8_0_1 =
7692 util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7693
7694 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
7695 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
7696
7697 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7698 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7699 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7700
7701 output_data += output_depth;
7702 }
7703 for (; i_width < output_width_overall_micro_repeats; ++i_width) {
7704 // output_width == 1.
7705 const int8* input_data = scratch_data + 4 + 4 * i_width;
7706
7707 // Load next sub-micro block of data.
7708 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7709 input_bank_a_reg = vld1q_lane_8x4(
7710 input_data + workspace_height_stride, input_bank_a_reg, 3);
7711 input_bank_b_reg = vld1q_lane_8x4(
7712 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7713 input_bank_b_reg = vld1q_lane_8x4(
7714 input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
7715 input_bank_c_reg = vld1q_lane_8x4(
7716 input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
7717
7718 int16x8_t acc_s16_0_1;
7719 int8x8_t acc_u8_0_1;
7720 // Iterate over input width shifts within 4x4 blocks.
7721 {
7722 acc0 = adjusted_bias_data_s_0;
7723 acc1 = adjusted_bias_data_s_0;
7724
7725 acc0 =
7726 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7727 acc0 =
7728 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7729 acc0 =
7730 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7731 acc1 =
7732 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7733 acc1 =
7734 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7735 acc1 =
7736 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7737
7738 // Fixed-point multiplication.
7739 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7740 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7741 acc0, output_shift_s_0);
7742 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7743 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7744 acc1, output_shift_s_0);
7745 // Add the output offset.
7746 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7747 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7748 // Apply the activation function.
7749 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7750 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7751 vget_low_s8(output_activation_min_vec));
7752 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7753 vget_low_s8(output_activation_max_vec));
7754
7755 vst1_lane_8x4(output_data, acc_u8_0_1, 0);
7756 vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7757
7758 acc0 = adjusted_bias_data_s_1;
7759 acc1 = adjusted_bias_data_s_1;
7760
7761 acc0 =
7762 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7763 acc0 =
7764 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7765 acc0 =
7766 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7767 acc1 =
7768 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7769 acc1 =
7770 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7771 acc1 =
7772 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7773
7774 // Fixed-point multiplication.
7775 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7776 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7777 acc0, output_shift_s_1);
7778 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7779 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7780 acc1, output_shift_s_1);
7781 // Add the output offset.
7782 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7783 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7784 // Apply the activation function.
7785 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7786 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7787 vget_low_s8(output_activation_min_vec));
7788 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7789 vget_low_s8(output_activation_max_vec));
7790
7791 vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
7792 vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
7793 1);
7794
7795 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7796 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7797 input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7798
7799 output_data += output_depth;
7800 }
7801 }
7802 } else {
7803 TFLITE_DCHECK_EQ(block_height, 1);
7804 // Work through one slice, by row, at a time.
7805 const int8* scratch_data = scratch_block_data;
7806 typename QuantizationTypeImpl<quantization_type>::ExternalType*
7807 output_data = output_block_data + 8 * j_depth;
7808
7809 int8x16_t input_bank_a_reg; // left 0, right 0, left 1, right 1.
7810 int8x16_t input_bank_b_reg; // left 2, right 2, xxx, xxx.
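// Single output row at stride 2: bank a holds input rows 0 and 1, the low half
// of bank b holds row 2, and the banks shift right by two bytes (two input
// columns, i.e. one stride-2 step) per output column.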
7811
7812 // Load first sub-micro block of data into operational banks.
7813 input_bank_a_reg =
7814 vld1q_dup_s8x4(scratch_data); // Load lane 0, avoiding
7815 // uninitialized variable.
7816 input_bank_a_reg = vld1q_lane_8x4(
7817 scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7818 input_bank_b_reg = vld1q_dup_s8x4(
7819 scratch_data +
7820 2 * workspace_height_stride); // Load lane 0, avoiding
7821 // uninitialized variable.
7822
7823 int32x4_t acc0;
7824 int32x4_t acc1;
7825
7826 for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7827 ++i_width) {
7828 const int output_width =
7829 i_width == output_width_micro_repeats ? residual_width : 2;
7830
7831 TFLITE_DCHECK_LE(output_width, 2);
7832 TFLITE_DCHECK_GE(output_width, 1);
7833 TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
7834 const int8* input_data = scratch_data + 4 + 4 * i_width;
7835
7836 // Load next sub-micro block of data.
7837 input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7838 input_bank_a_reg = vld1q_lane_8x4(
7839 input_data + workspace_height_stride, input_bank_a_reg, 3);
7840 input_bank_b_reg = vld1q_lane_8x4(
7841 input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7842
7843 int16x8_t acc_s16_0_1;
7844 int8x8_t acc_u8_0_1;
7845
7846 // Iterate over input width shifts within 4x4 blocks.
7847 {
7848 acc0 = adjusted_bias_data_s_0;
7849
7850 acc0 =
7851 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7852 acc0 =
7853 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7854 acc0 =
7855 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7856
7857 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7858 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7859 acc0, output_shift_s_0);
7860
7861 // Second sub-block accumulation.
7862 acc1 = adjusted_bias_data_s_1;
7863
7864 acc1 =
7865 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
7866 acc1 =
7867 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
7868 acc1 =
7869 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
7870
7871 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7872 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7873 acc1, output_shift_s_1);
7874
7875 // Add the output offset.
7876 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7877 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7878 // Apply the activation function.
7879 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7880 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7881 vget_low_s8(output_activation_min_vec));
7882 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7883 vget_low_s8(output_activation_max_vec));
7884
7885 // This stores the results for both sub-blocks together.
7886 vst1_s8(output_data, acc_u8_0_1);
7887
7888 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7889 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7890
7891 output_data += output_depth;
7892 }
7893 if (output_width == 2) {
7894 acc0 = adjusted_bias_data_s_0;
7895
7896 acc0 =
7897 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7898 acc0 =
7899 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7900 acc0 =
7901 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7902
7903 acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7904 acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7905 acc0, output_shift_s_0);
7906
7907 // Second sub-block accumulation.
7908 acc1 = adjusted_bias_data_s_1;
7909
7910 acc1 =
7911 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
7912 acc1 =
7913 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
7914 acc1 =
7915 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
7916
7917 acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7918 acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7919 acc1, output_shift_s_1);
7920
7921 // Add the output offset.
7922 acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7923 acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7924 // Apply the activation function.
7925 acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7926 acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7927 vget_low_s8(output_activation_min_vec));
7928 acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7929 vget_low_s8(output_activation_max_vec));
7930
7931 // This stores the results for both sub-blocks together.
7932 vst1_s8(output_data, acc_u8_0_1);
7933
7934 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7935 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7936
7937 output_data += output_depth;
7938 }
7939 }
7940 }
7941 }
7942 }
7943
7944 static inline void Run(const int8* scratch_block_data,
7945 const int8* filter_workspace, const int32* bias_data,
7946 int8* output_block_data,
7947 const DepthwiseConvDotProdParams* function_params) {
7948 KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
7949 output_block_data, function_params);
7950 }
7951 };
7952
7953 #undef vst1_lane_8x4
7954 #undef vst1q_lane_8x4
7955 #undef vld1q_lane_s8x8
7956 #undef vld1_lane_8x4
7957 #undef vld1q_lane_8x4
7958 #undef vld1q_dup_s8x4
7959
7960 #endif // USE_NEON
7961
7962 } // namespace depthwise_conv
7963 } // namespace optimized_ops
7964 } // namespace tflite
7965
7966 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
7967