/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_

// This file provides kernel implementations that are not used in shipped
// inference code, but rather (a) show how model C++ code is designed and then
// transformed into asm code, and (b) aid with maintenance and later development
// of variations. Many projects (even including, say, the classic NAG libraries)
// develop highly optimized code, but do not maintain intermediate versions.
// Often the result is incomprehensible final-version code.

#include <algorithm>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {

#ifdef USE_NEON

inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) {
  return vst1_u8(data_addr, vreinterpret_u8_s8(reg));
}
inline void util_vst1_x8(int8* data_addr, int8x8_t reg) {
  return vst1_s8(data_addr, reg);
}

// Lane operations are for clarity and convenience. We want to load and store
// 4 8-bit lanes together. So these are treated much like 32-bit loads and
// 32-bit stores. Stores require 32-bit alignment.

#define vst1_lane_8x4(dst, reg, lane_num)                         \
  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
#define vst1q_lane_8x4(dst, reg, lane_num)                        \
  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)

// Important! Most compilation configurations will compile and run without
// reinterpret_cast. Sanitizers may fail silently on lane-loading, with an
// obscure bug or mis-feature probably in unhygienic macro expansion.
#define vld1q_lane_s8x8(src, reg, lane_num) \
  vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
#define vld1_lane_8x4(src, reg, lane_num) \
  vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
#define vld1q_lane_8x4(src, reg, lane_num) \
  vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))

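// Illustrative usage of the lane macros above (a hypothetical sketch, not
// called by the kernels below): 4 int8 values are moved through a 32-bit
// lane, so the store address must be 4-byte aligned.
//
//   int8 src[4] = {1, 2, 3, 4};
//   alignas(4) int8 dst[4];
//   int32x2_t reg = vdup_n_s32(0);
//   reg = vld1_lane_8x4(src, reg, 0);                  // 32-bit lane load.
//   vst1_lane_8x4(dst, vreinterpret_u32_s32(reg), 0);  // 32-bit lane store.
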
#endif  // USE_NEON

template <QuantizationType quantization_type>
struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
                       quantization_type> {
  // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
  // width 3, depth/8 micro blocks, sub-block 0 or 1, depth 4. Filter data is
  // written as filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
  //
  // Note that this rearrangement is much like that performed on input data when
  // filling the workspace, and optimized versions will be similar.
  static inline void FillFilterBank(int depth, const uint8* filter_block,
                                    int8 filter_bank[3][2][4][4]) {
    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    // Load filter data in, 8-bytes down depth / sub-block at a time.
    //
    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
    // depth 4.
    uint8 loaded_filter[3][4][2][4];
    for (int y = 0; y < 3; ++y) {
      for (int x = 0; x < 3; ++x) {
        memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
               8);
      }
      // Pad the filter with the symmetric representation of 0, so that the
      // values become 0 when the zero point is subtracted below. Thus these
      // filter taps are effectively disregarded in later filtering.
      memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
    }
    for (int y = 0; y < 3; ++y) {
      for (int z = 0; z < 4; ++z) {
        for (int x = 0; x < 4; ++x) {
          filter_bank[y][0][z][x] =
              loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
          filter_bank[y][1][z][x] =
              loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
        }
      }
    }
  }

  // Adjust the bias (weights) data according to the input offset.
  //
  // The output calculation is
  // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
  //                                 (filter[i][j][d] + filter_offset)
  // (where offsets are expressed as differences from 128).
  //
  // Since we cannot efficiently handle varying offsets / bias across the image,
  // we insist on filter_offset = 0.
  //
  // This function calculates
  // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
  // which accounts for input offset. If the bias is constant over the depth,
  // the adjusted bias will vary.
  static inline void AdjustBias(int32 input_offset,
                                const int8 filter_bank[3][2][4][4],
                                const int32* bias_data,
                                int32 adjusted_bias_block[2][4]) {
    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    TFLITE_DCHECK_GE(input_offset, -255);
    TFLITE_DCHECK_LE(input_offset, 0);
    // For instance, if input_offset == -128, no adjustment is needed.
    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;

    for (int s = 0; s < 2; ++s) {
      for (int z = 0; z < 4; ++z) {
        adjusted_bias_block[s][z] = bias_data[4 * s + z];
        for (int i = 0; i < 9; ++i) {
          adjusted_bias_block[s][z] +=
              input_offset_difference * filter_bank[i % 3][s][z][i / 3];
        }
      }
    }
  }
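
  // Worked example of the adjustment above (hypothetical numbers): with an
  // input zero point of 128, input_offset == -128, the difference term is 0
  // and adjusted_bias == bias. With an input zero point of 100,
  // input_offset == -100, the difference is 28; for a depth position whose
  // nine filter taps sum to S, the adjusted bias is bias[d] + 28 * S.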

  static void Run(const uint8* filter_data, const int32* bias_data,
                  int8* shuffled_filter_data, int32* adjusted_bias_data,
                  const DepthwiseConvDotProdParams* function_params) {
    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
    const int depth = function_params->output_depth;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int bias_increment = function_params->bias_increment;
    const int32 input_offset = function_params->input_offset;

    int8 filter_bank[3][2][4][4];
    int32 adjusted_bias_block[2][4];

    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
      FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
      AdjustBias(input_offset, filter_bank,
                 bias_data + 2 * bias_increment * j_depth, adjusted_bias_block);

      memcpy(shuffled_filter_data, filter_bank[0][0][0],
             shuffled_filter_increment);
      shuffled_filter_data += shuffled_filter_increment;
      memcpy(adjusted_bias_data, adjusted_bias_block[0],
             8 * sizeof(adjusted_bias_block[0][0]));
      adjusted_bias_data += 8;
    }
  }
};
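
// A hypothetical sketch of how a caller might drive the C-model version of
// ProcessPerDepth (the field names exist in DepthwiseConvDotProdParams; the
// literal values are made up for illustration):
//
//   DepthwiseConvDotProdParams params = {};
//   params.output_depth = 64;
//   params.depth_micro_repeats = 64 / 8;
//   params.bias_increment = 4;
//   params.input_offset = -128;
//   ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
//                   QuantizationType::kNonPerChannelUint8>::Run(
//       filter_data, bias_data, shuffled_filter_data, adjusted_bias_data,
//       &params);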

template <QuantizationType quantization_type>
struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
                       quantization_type> {
  static inline void Run(const uint8* filter_data, const int32* bias_data,
                         int8* shuffled_filter_data, int32* adjusted_bias_data,
                         const DepthwiseConvDotProdParams* function_params) {
    const int depth = function_params->output_depth;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int bias_increment = function_params->bias_increment;

    // Simulate NEON-register transposition of subset of filter.
    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
    int8 filter_bank_a_1[4][4];
    int8 filter_bank_a_2[4][4];
    int8 filter_bank_b_0[4][4];
    int8 filter_bank_b_1[4][4];
    int8 filter_bank_b_2[4][4];

    // Load filter data in, essentially dropping the [depth/8] dimension, which
    // is equivalent to loading just the depth needed for one micro-block.
    //
    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
    // depth 4.
    uint8 loaded_filter_0[4][2][4];
    uint8 loaded_filter_1[4][2][4];
    uint8 loaded_filter_2[4][2][4];

    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    const int32 input_offset = function_params->input_offset;
    TFLITE_DCHECK_GE(input_offset, -255);
    TFLITE_DCHECK_LE(input_offset, 0);
    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;

    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
      const uint8* filter_block = filter_data + 8 * j_depth;

      // Filter data is provided as filter_block[3][3][depth/8][2][4]:
      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
      // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
      for (int x = 0; x < 3; ++x) {
        memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
               8);
        memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
               8);
        memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
               8);
      }
      // Pad the filter with the symmetric representation of 0, so that the
      // values become 0 when the zero point is subtracted later, and so these
      // filter taps are effectively disregarded.
      memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
      memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
      memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);

      for (int z = 0; z < 4; ++z) {
        for (int x = 0; x < 4; ++x) {
          filter_bank_a_0[z][x] =
              loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
          filter_bank_b_0[z][x] =
              loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
          filter_bank_a_1[z][x] =
              loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
          filter_bank_b_1[z][x] =
              loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
          filter_bank_a_2[z][x] =
              loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
          filter_bank_b_2[z][x] =
              loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
        }
      }

      memcpy(shuffled_filter_data, filter_bank_a_0, 16);
      shuffled_filter_data += 16;
      memcpy(shuffled_filter_data, filter_bank_b_0, 16);
      shuffled_filter_data += 16;
      memcpy(shuffled_filter_data, filter_bank_a_1, 16);
      shuffled_filter_data += 16;
      memcpy(shuffled_filter_data, filter_bank_b_1, 16);
      shuffled_filter_data += 16;
      memcpy(shuffled_filter_data, filter_bank_a_2, 16);
      shuffled_filter_data += 16;
      memcpy(shuffled_filter_data, filter_bank_b_2, 16);
      shuffled_filter_data += 16;

      int32 adjusted_bias_data_0[4];
      int32 adjusted_bias_data_1[4];
      // For instance, if input_offset == -128, no adjustment is needed.
      for (int z = 0; z < 4; ++z) {
        adjusted_bias_data_0[z] = bias_data[z];
        adjusted_bias_data_1[z] = bias_data[4 + z];
        for (int x = 0; x < 4; ++x) {
          adjusted_bias_data_0[z] +=
              input_offset_difference * filter_bank_a_0[z][x];
          adjusted_bias_data_0[z] +=
              input_offset_difference * filter_bank_a_1[z][x];
          adjusted_bias_data_0[z] +=
              input_offset_difference * filter_bank_a_2[z][x];
          adjusted_bias_data_1[z] +=
              input_offset_difference * filter_bank_b_0[z][x];
          adjusted_bias_data_1[z] +=
              input_offset_difference * filter_bank_b_1[z][x];
          adjusted_bias_data_1[z] +=
              input_offset_difference * filter_bank_b_2[z][x];

          adjusted_bias_data[z] = adjusted_bias_data_0[z];
          adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
        }
      }
      bias_data += 2 * bias_increment;
      adjusted_bias_data += 8;
    }
  }
};

#ifdef USE_NEON
template <QuantizationType quantization_type>
struct ProcessPerDepth<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
                       quantization_type> {
  static void ProcessPerDepthIntrinsics(
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          filter_data,
      const int32* bias_data, int8* shuffled_filter_data,
      int32* adjusted_bias_data,
      const DepthwiseConvDotProdParams* function_params) {
    const int depth = function_params->output_depth;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int bias_increment = function_params->bias_increment;

    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    constexpr uint8 kSignBit =
        QuantizationTypeImpl<quantization_type>::kUint8SignBit;
    const int32 input_offset = function_params->input_offset;
    if (quantization_type == QuantizationType::kNonPerChannelUint8) {
      TFLITE_DCHECK_GE(input_offset, -255);
      TFLITE_DCHECK_LE(input_offset, 0);
    }
    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
    const int8x16_t ones_vector = vdupq_n_s8(1);

    // Simulate NEON-register transposition of subset of filter.
    int8x16_t input_0_a;
    int8x16_t input_0_b;
    int8x16_t input_0_c;
    int8x16_t input_1_a;
    int8x16_t input_1_b;
    int8x16_t input_1_c;
    int8x16_t input_2_a;
    int8x16_t input_2_b;
    int8x16_t input_2_c;

    int8x16_t filter_0_a;
    int8x16_t filter_0_b;
    int8x16_t filter_1_a;
    int8x16_t filter_1_b;
    int8x16_t filter_2_a;
    int8x16_t filter_2_b;

    // For uint8, effect subtraction of zero-point = 128 by XOR of sign bit.
    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);

    const typename QuantizationTypeImpl<quantization_type>::ExternalType*
        filter_block = filter_data;
    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
      // Filter data is provided as filter_block[3][3][depth/8][2][4]:
      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
      // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.

      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          filter_block_ptr = filter_block;
      input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0);
      filter_block_ptr += depth;
      input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0);
      filter_block_ptr += depth;
      input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0);
      filter_block_ptr += depth;
      input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0);
      filter_block_ptr += depth;
      input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0);
      filter_block_ptr += depth;
      input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0);
      filter_block_ptr += depth;
      input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0);
      filter_block_ptr += depth;
      input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0);
      filter_block_ptr += depth;
      input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0);

      filter_0_a = vzip1q_s8(input_0_a, input_0_b);
      filter_0_b = vzip1q_s8(input_0_c, sign_bit);
      filter_1_a = vzip1q_s8(input_1_a, input_1_b);
      filter_1_b = vzip1q_s8(input_1_c, sign_bit);
      filter_2_a = vzip1q_s8(input_2_a, input_2_b);
      filter_2_b = vzip1q_s8(input_2_c, sign_bit);
      if (quantization_type == QuantizationType::kNonPerChannelUint8) {
        filter_0_a = veorq_s8(filter_0_a, sign_bit);
        filter_0_b = veorq_s8(filter_0_b, sign_bit);
        filter_1_a = veorq_s8(filter_1_a, sign_bit);
        filter_1_b = veorq_s8(filter_1_b, sign_bit);
        filter_2_a = veorq_s8(filter_2_a, sign_bit);
        filter_2_b = veorq_s8(filter_2_b, sign_bit);
      }
      vzipq_s8x2_in_place(&filter_0_a, &filter_0_b);
      vzipq_s8x2_in_place(&filter_1_a, &filter_1_b);
      vzipq_s8x2_in_place(&filter_2_a, &filter_2_b);

      vst1q_s8(shuffled_filter_data, filter_0_a);
      shuffled_filter_data += 16;
      vst1q_s8(shuffled_filter_data, filter_0_b);
      shuffled_filter_data += 16;
      vst1q_s8(shuffled_filter_data, filter_1_a);
      shuffled_filter_data += 16;
      vst1q_s8(shuffled_filter_data, filter_1_b);
      shuffled_filter_data += 16;
      vst1q_s8(shuffled_filter_data, filter_2_a);
      shuffled_filter_data += 16;
      vst1q_s8(shuffled_filter_data, filter_2_b);
      shuffled_filter_data += 16;

      int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
      bias_data += bias_increment;
      int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
      bias_data += bias_increment;
      // For instance, if input_offset is -kIntSymmetricZeroPoint, no
      // adjustment is needed.

      int32x4_t filter_sum_a = vdupq_n_s32(0);
      filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector);
      filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector);
      filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector);
      int32x4_t filter_sum_b = vdupq_n_s32(0);
      filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector);
      filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector);
      filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector);

      adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
                                         input_offset_difference);
      adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
                                         input_offset_difference);

      vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
      adjusted_bias_data += 4;
      vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
      adjusted_bias_data += 4;

      filter_block += 8;
    }
  }

  static inline void Run(const typename QuantizationTypeImpl<
                             quantization_type>::ExternalType* filter_data,
                         const int32* bias_data, int8* shuffled_filter_data,
                         int32* adjusted_bias_data,
                         const DepthwiseConvDotProdParams* function_params) {
    ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
                              adjusted_bias_data, function_params);
  }
};
#endif

template <QuantizationType quantization_type, int32 max_padding>
struct PackMacroBlock<
    DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
    DepthwiseConvDepthMultiplication::kNoMultiplication, max_padding> {
  // A straight copy of a macro block of input data into a scratch buffer.
  //
  // Requirement: depth_micro_repeats > 0.
  static inline void CopyMacroBlock(
      int32 height_block_number, int32 width_block_number,
      const DepthwiseConvDotProdParams& function_params,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data) {
    TFLITE_DCHECK_LE(max_padding, 1);

    // Strides.
    // The input depth and count of micro blocks provide the width strides.
    const int input_height_stride = function_params.input_height_stride;
    const int workspace_height_stride = function_params.workspace_height_stride;
    const int input_depth = function_params.input_depth;
    const int depth_micro_repeats = function_params.depth_micro_repeats;
    TFLITE_DCHECK_GT(depth_micro_repeats, 0);

    // Remaining iteration and dimension parameters.
    //
    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
    // final micro block is incomplete.
    const int width_overall_micro_repeats =
        function_params.input_width_overall_micro_repeats;
    int input_width_micro_repeats = function_params.input_width_micro_repeats;
    const int residual_width = function_params.residual_width;
    const int block_height = function_params.inbound_block_height;

    const int padding_left = function_params.padding_left;
    const int padding_right = function_params.padding_right;
    const int padding_top = function_params.padding_top;
    const int padding_bottom = function_params.padding_bottom;

    const bool leading_width_padding =
        padding_left > 0 && width_block_number == 0;
    const bool trailing_width_padding =
        padding_right > 0 &&
        width_block_number == (function_params.width_macro_count - 1);
    const bool leading_height_padding =
        padding_top > 0 && height_block_number < 0;
    const bool trailing_height_padding =
        padding_bottom > 0 &&
        height_block_number == (function_params.height_macro_count - 1);

    // Modify the trailing case to reflect the input width.
    int input_residual_width =
        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
                                                                : 4;
    if (trailing_width_padding) {
      input_residual_width -= 1;
      input_width_micro_repeats = width_overall_micro_repeats - 1;
    }

    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    const int32 input_offset_difference =
        function_params.input_offset + kSymmetricZeroPoint;

    // We load data into a temporary buffer and then save, to match subsequent
    // processing. This will make it easier to combine stages into one ASM
    // routine.
    int8 tmp_load[4][2][4];

    int copy_block_height = block_height;
    if (leading_height_padding) {
      memset(scratch_block_data, -input_offset_difference,
             workspace_height_stride);
      scratch_block_data += workspace_height_stride;
      input_block_data += input_height_stride;
      copy_block_height -= 1;
    }
    if (trailing_height_padding) {
      copy_block_height -= 1;
    }

    // The outer 3 loops go through all the micro blocks in a macro block.
    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
        // Figure out division of work (available input vs trailing padding).
        int adjusted_residual_width =
            j_width == input_width_micro_repeats ? input_residual_width : 4;

        int start_width = 0;
        if (leading_width_padding && j_width == 0) {
          start_width = 1;
          memset(tmp_load[0][0], -input_offset_difference, 8);
        }
        if (adjusted_residual_width < 4) {
          for (int x = adjusted_residual_width; x < 4; ++x) {
            memset(tmp_load[x][0], -input_offset_difference, 8);
          }
        }

        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
          // The inner 3 loops go through the sub-block, depth and width within
          // each micro block.

          // Load, and apply symmetric offset.
          int8* scratch_data =
              scratch_block_data + k_height * workspace_height_stride +
              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
          const typename QuantizationTypeImpl<quantization_type>::ExternalType*
              input_data = input_block_data + k_height * input_height_stride +
                           j_width * 4 * input_depth + i_depth * 8;
          // Full-size micro blocks are 2*4*4 = 32 bytes.
          for (int x = start_width; x < adjusted_residual_width; ++x) {
            for (int s = 0; s < 2; ++s) {
              for (int d = 0; d < 4; ++d) {
                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
                                    kSymmetricZeroPoint;
              }
            }
          }

          // Save results.
          memcpy(&scratch_data[0], tmp_load[0][0], 8);
          memcpy(&scratch_data[8], tmp_load[1][0], 8);
          memcpy(&scratch_data[16], tmp_load[2][0], 8);
          memcpy(&scratch_data[24], tmp_load[3][0], 8);
        }
      }
    }

    if (trailing_height_padding) {
      memset(scratch_block_data + copy_block_height * workspace_height_stride,
             -input_offset_difference, workspace_height_stride);
    }
  }

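  // Worked example of the scratch indexing used above (hypothetical sizes):
  // with width_overall_micro_repeats == 3, the micro block at k_height == 1,
  // j_width == 2, i_depth == 1 starts at
  //   1 * workspace_height_stride + 2 * 32 + 1 * 32 * 3
  // bytes into the workspace. Micro blocks are therefore laid out width-major
  // within each depth slice, and the depth slices of one row are concatenated
  // before the next row begins.
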
  // Transpose 4x4 blocks within each sub-micro-block.
  //
  // Implemented somewhat like NEON register manipulation, so that we can see
  // equivalence of the two approaches.
  static inline void MicroTransposeBlocks(
      const DepthwiseConvDotProdParams& function_params,
      int8* scratch_block_data) {
    const int workspace_height_stride = function_params.workspace_height_stride;
    const int width_overall_micro_repeats =
        function_params.input_width_overall_micro_repeats;
    const int depth_micro_repeats = function_params.depth_micro_repeats;
    const int block_height = function_params.inbound_block_height;

    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
    // NEON code we are simulating.
    int8 tmp_load[4][2][4];         // [width][sub-block][depth]
    int8 tmp_transposed[4][2][4];   // [depth][sub-block][width]
    int8 tmp_interleaved[2][4][4];  // [sub-block][depth][width]

    // The outer 3 loops go through all the micro blocks in a macro block.
    for (int k_height = 0; k_height < block_height; ++k_height) {
      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
          int8* scratch_data =
              scratch_block_data + k_height * workspace_height_stride +
              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
          // A. Load data
          memcpy(tmp_load[0][0], &scratch_data[0], 8);
          memcpy(tmp_load[1][0], &scratch_data[8], 8);
          memcpy(tmp_load[2][0], &scratch_data[16], 8);
          memcpy(tmp_load[3][0], &scratch_data[24], 8);

          // B. Simulate between-register transposition.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_transposed[x][0][y] = tmp_load[y][0][x];
              tmp_transposed[x][1][y] = tmp_load[y][1][x];
            }
          }

          // C. Simulate between-register interleaving.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
            }
          }
          // D. Simulate mangled storage arrangement.
          memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
          memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
        }
      }
    }
  }

  static inline void Run(
      int32 height_block_number, int32 width_block_number,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    CopyMacroBlock(height_block_number, width_block_number, *function_params,
                   input_block_data, scratch_block_data);
    MicroTransposeBlocks(*function_params, scratch_block_data);
  }
};

template <QuantizationType quantization_type, int32 max_padding>
struct PackMacroBlock<
    DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
    DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
  static inline void Run(
      int32 height_block_number, int32 width_block_number,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Currently support for padding is limited to 1 on any side.
    TFLITE_DCHECK_LE(max_padding, 1);

    // Strides.
    // The count of micro blocks (below) provides the width strides.
    const int input_height_stride = function_params->input_height_stride;
    const int workspace_height_stride =
        function_params->workspace_height_stride;

    // Remaining iteration and dimension parameters.
    //
    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
    // final micro block is incomplete.
    const int width_overall_micro_repeats =
        function_params->input_width_overall_micro_repeats;
    const int input_width_micro_repeats =
        function_params->input_width_micro_repeats;
    const int residual_width = function_params->residual_width;
    const int block_height = function_params->inbound_block_height;
    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);

    const int padding_left = function_params->padding_left;
    const int padding_right = function_params->padding_right;
    const int padding_top = function_params->padding_top;
    const int padding_bottom = function_params->padding_bottom;

    const bool leading_width_padding =
        padding_left > 0 && width_block_number == 0;
    const bool trailing_width_padding =
        padding_right > 0 &&
        width_block_number == (function_params->width_macro_count - 1);
    const bool leading_height_padding =
        padding_top > 0 && height_block_number < 0;
    const bool trailing_height_padding =
        padding_bottom > 0 &&
        height_block_number == (function_params->height_macro_count - 1);

    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    const int32 input_offset_difference =
        function_params->input_offset + kSymmetricZeroPoint;

    int copy_block_height = block_height;
    if (leading_height_padding) {
      memset(scratch_block_data, -input_offset_difference,
             workspace_height_stride + kWorkspaceExtension);
      scratch_block_data += workspace_height_stride;
      input_block_data += input_height_stride;
      copy_block_height -= 1;
    }
    if (trailing_height_padding) {
      copy_block_height -= 1;
    }

    int adjusted_residual_width =
        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
                                                                : 4;

    if (trailing_width_padding) {
      adjusted_residual_width -= 1;
    }
    int start_width = 0;
    if (leading_width_padding) {
      start_width = 1;
      input_block_data += 1;
    }

    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
                          adjusted_residual_width - start_width;

    TFLITE_DCHECK_LE(
        copy_size,
        input_height_stride - width_block_number * input_width_micro_repeats);
    // We may drop up to stride-1 of trailing input.
    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);

    // When there is unit input depth, the micro-block iteration need only run
    // through the height. The micro blocks are contiguous across the width.
    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_data = input_block_data + k_height * input_height_stride;
      int8* scratch_data =
          scratch_block_data + k_height * workspace_height_stride;

      // Handle leading padding. This is overwritten if there is no padding.
      scratch_data[0] = -input_offset_difference;

      memcpy(&scratch_data[start_width], input_data, copy_size);
      for (int i = 0; i < copy_size; ++i) {
        scratch_data[start_width + i] += -kSymmetricZeroPoint;
      }

      // Handle trailing padding, and fill in remainder of micro block.
      memset(&scratch_data[start_width + copy_size], -input_offset_difference,
             4 - adjusted_residual_width + kWorkspaceExtension);
    }

    if (trailing_height_padding) {
      memset(scratch_block_data + copy_block_height * workspace_height_stride,
             -input_offset_difference,
             workspace_height_stride + kWorkspaceExtension);
    }
  }
};
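
// Worked example of the copy_size computation above (hypothetical sizes):
// with width_overall_micro_repeats == 5, adjusted_residual_width == 3 and
// start_width == 1, copy_size == (5 - 1) * 4 + 3 - 1 == 18 bytes per row,
// i.e. everything except the leading and trailing padding positions.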

// Beginning of code section containing intermediate code transformation.
//
// This section is only compiled when kUseUnwound3x3DotProduct versions of
// templated functions are selected.
template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
                      quantization_type,
                      DepthwiseConvDepthMultiplication::kNoMultiplication,
                      /*max_padding=*/0> {
  static inline void Run(
      int32 height_block_number, int32 width_block_number,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    const int workspace_height_stride =
        function_params->workspace_height_stride;
    const int width_overall_micro_repeats =
        function_params->input_width_overall_micro_repeats;
    const int input_width_micro_repeats =
        function_params->input_width_micro_repeats;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int block_height = function_params->inbound_block_height;
    const int residual_width = function_params->residual_width;
    const int input_height_stride = function_params->input_height_stride;
    const int input_depth = function_params->input_depth;

    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
    const int micro_block_size = 4 * 8;
    const int depth_advance = width_overall_micro_repeats * micro_block_size;
    const int width_advance =
        micro_block_size *
        (1 - depth_micro_repeats * width_overall_micro_repeats);
    const int height_advance = workspace_height_stride -
                               width_overall_micro_repeats * micro_block_size;
    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;

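    // Worked example of the pointer advances above (hypothetical sizes): with
    // width_overall_micro_repeats == 3 and depth_micro_repeats == 2,
    // depth_advance == 96 and width_advance == 32 * (1 - 6) == -160, so one
    // full pass of the depth loop plus the width advance moves the scratch
    // pointer by 2 * 96 - 160 == 32 bytes, i.e. to the next micro block of
    // depth slice 0.
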
    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
    // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
    // down the depth.
    int8 tmp_load[4][2][4];
    int8 tmp_transposed[4][2][4];
    int8 tmp_interleaved[2][4][4];

    // Work through one slice, by row, at a time.
    int8* scratch_data = scratch_block_data;
    for (int k_height = 0; k_height < block_height; ++k_height) {
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_data = input_block_data;
      input_block_data += input_height_stride;

      // Traverse the width one point at a time, but the depth in (micro) blocks
      // of size 8.
      //
      // The depth and width margins, which are filled with "zeros", may be
      // larger than is strictly needed to calculate output. This is because the
      // conv calculation is performed across complete micro blocks.
      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
        // Load, then zero.
        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
          // A. Simulate register loading.
          for (int x = 0; x < 4; ++x) {
            for (int s = 0; s < 2; ++s) {
              for (int d = 0; d < 4; ++d) {
                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
                                    kSymmetricZeroPoint;
              }
            }
          }
          // B. Simulate between-register transposition.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_transposed[x][0][y] = tmp_load[y][0][x];
              tmp_transposed[x][1][y] = tmp_load[y][1][x];
            }
          }

          // C and D are to be performed together as 4-byte stores in NEON code.
          // C. Simulate between-register interleaving.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
            }
          }
          // D. Simulate mangled storage arrangement.
          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);

          scratch_data += depth_advance;
          input_data += 8;
        }
        scratch_data += width_advance;
        input_data += input_depth_skip;
      }
      if (width_overall_micro_repeats > input_width_micro_repeats) {
        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
                         input_width_micro_repeats + 1);
        TFLITE_DCHECK_GT(residual_width, 0);
        // Figure out division of work (available input vs zero-ed).
        const int adjusted_residual_width = residual_width;
        // Load, then zero.
        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
          // A. Simulate register loading.
          for (int x = 0; x < adjusted_residual_width; ++x) {
            for (int s = 0; s < 2; ++s) {
              for (int d = 0; d < 4; ++d) {
                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
                                    kSymmetricZeroPoint;
              }
            }
          }
          for (int x = adjusted_residual_width; x < 4; ++x) {
            for (int s = 0; s < 2; ++s) {
              for (int d = 0; d < 4; ++d) {
                tmp_load[x][s][d] = 0;
              }
            }
          }
          // B. Simulate between-register transposition.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_transposed[x][0][y] = tmp_load[y][0][x];
              tmp_transposed[x][1][y] = tmp_load[y][1][x];
            }
          }

          // C and D are to be performed together as 4-byte stores in NEON code.
          // C. Simulate between-register interleaving.
          for (int x = 0; x < 4; ++x) {
            for (int y = 0; y < 4; ++y) {
              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
            }
          }
          // D. Simulate mangled storage arrangement.
          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);

          scratch_data += depth_advance;
          input_data += 8;
        }
        scratch_data += width_advance;
        input_data += input_depth_skip;
      }
      scratch_data += height_advance;
    }

    TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
                                       block_height * workspace_height_stride);
  }
};

template <QuantizationType quantization_type>
struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
                      quantization_type,
                      DepthwiseConvDepthMultiplication::kNoMultiplication,
                      /*max_padding=*/1> {
  static inline void Run(
      int32 height_block_number, int32 width_block_number,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Just use the C model code for the padded case. Optimized versions merge
    // the modifications therein to handle padding.
    PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
                   quantization_type,
                   DepthwiseConvDepthMultiplication::kNoMultiplication,
                   /*max_padding=*/1>::Run(height_block_number,
                                           width_block_number, input_block_data,
                                           scratch_block_data, function_params);
  }
};

template <QuantizationType quantization_type, int32 max_padding>
struct PackMacroBlock<
    DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
    DepthwiseConvDepthMultiplication::kUnitInputDepth, max_padding> {
  static inline void Run(
      int32 height_block_number, int32 width_block_number,
      const typename QuantizationTypeImpl<quantization_type>::ExternalType*
          input_block_data,
      int8* scratch_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    const int workspace_height_stride =
        function_params->workspace_height_stride;
    const int width_overall_micro_repeats =
        function_params->input_width_overall_micro_repeats;
    const int input_width_micro_repeats =
        function_params->input_width_micro_repeats;
    const int block_height = function_params->inbound_block_height;
    const int residual_width = function_params->residual_width;
    const int input_height_stride = function_params->input_height_stride;

    const int padding_left = function_params->padding_left;
    const int padding_right = function_params->padding_right;
    const int padding_top = function_params->padding_top;
    const int padding_bottom = function_params->padding_bottom;

    constexpr int kSymmetricZeroPoint =
        QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;

    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);

    const bool leading_width_padding =
        padding_left > 0 && width_block_number == 0;
    const bool trailing_width_padding =
        padding_right > 0 &&
        width_block_number == (function_params->width_macro_count - 1);
    const bool leading_height_padding =
        padding_top > 0 && height_block_number < 0;
    const bool trailing_height_padding =
        padding_bottom > 0 &&
        height_block_number == (function_params->height_macro_count - 1);

    const int32 input_offset = function_params->input_offset;
    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;

    // Work through one slice, by row, at a time.
    int8* scratch_data_base = scratch_block_data;

    int copy_block_height = block_height;
    if (leading_height_padding) {
      copy_block_height -= 1;
      memset(scratch_data_base, -input_offset_difference,
             workspace_height_stride + kWorkspaceExtension);
      scratch_data_base += workspace_height_stride;
      input_block_data += input_height_stride;
    }
    if (trailing_height_padding) {
      copy_block_height -= 1;
    }

    int adjusted_residual_width =
        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
                                                                : 4;

    if (trailing_width_padding) {
      adjusted_residual_width -= 1;
    }
    int start_width = 0;
    if (leading_width_padding) {
      start_width = 1;
      input_block_data += 1;
    }

    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
                          adjusted_residual_width - start_width;
    // Adjusted so that later conditionals are simplified.
    const int copy_size_adjusted =
        trailing_width_padding ? copy_size + 1 : copy_size;

    TFLITE_DCHECK_LE(
        copy_size,
        input_height_stride - width_block_number * input_width_micro_repeats);
    // We may drop up to stride-1 of trailing input.
    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);

    // This is used to simulate what should happen in registers.
    int8 tmp_data[16];

    int scratch_data_offset = 0;
    int input_block_offset = 0;

    if (copy_size >= 16) {
      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
        // Work through one slice, by row, at a time.
        int8* scratch_data = scratch_data_base + scratch_data_offset;

        int copy_done = 0;

        // The surrounding condition ensures that we always need at least one
        // iteration of the main copy loop. In the case of leading width
        // padding, we unroll this specially.
        if (leading_width_padding) {
          memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
          for (int i = 0; i < 16; ++i) {
            tmp_data[i] += -kSymmetricZeroPoint;
          }
          tmp_data[0] = -input_offset_difference;
          memcpy(scratch_data, tmp_data, 16);
          copy_done += 15;
        }

        // Main copy loop.
        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
          memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
                 16);
          for (int i = 0; i < 16; ++i) {
            tmp_data[i] += -kSymmetricZeroPoint;
          }
          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
          memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
        }

        const int copy_remaining = copy_size - copy_done;
        // Total amount
        // = copy_size - copy_done + 4 - adjusted_residual_width
        // = width_overall_micro_repeats * 4 - start_width - copy_done.
        // Undone micro blocks
        // = width_overall_micro_repeats - (start_width + copy_done) / 4.

        // Conditional is (copy_remaining > 0 || trailing_width_padding).
        if (copy_done < copy_size_adjusted) {
          // Employ overlapping-load strategy in order to load full register,
          // but use only part.
          memcpy(tmp_data,
                 input_block_data + input_block_offset + copy_done -
                     (16 - copy_remaining),
                 16);
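          // For example (hypothetical values): if copy_remaining == 5, the
          // load above reads the 16 bytes ending at the copy end, and the
          // shift below moves its last 5 bytes down to positions 0..4.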
1068           // Shift to select the part that we need.
1069           for (int i = 0; i < copy_remaining; ++i) {
1070             tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
1071           }
1072           for (int i = 0; i < 16; ++i) {
1073             tmp_data[i] += -kSymmetricZeroPoint;
1074           }
1075           // Apply padding to remainder, some unnecessary but costless in regs.
1076           for (int i = copy_remaining; i < 16; ++i) {
1077             tmp_data[i] = -input_offset_difference;
1078           }
1079           const int final_repeats =
1080               width_overall_micro_repeats - (start_width + copy_done) / 4;
1081           for (int i = 0; i < final_repeats; ++i) {
1082             memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
1083             copy_done += 4;
1084           }
1085         }
1086         memset(scratch_data + start_width + copy_done, -input_offset_difference,
1087                kWorkspaceExtension);
1088 
1089         scratch_data_offset += workspace_height_stride;
1090         input_block_offset += input_height_stride;
1091       }
1092     } else if (copy_size >= 4) {
1093       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1094         // Work through one slice, by row, at a time.
1095         int8* scratch_data = scratch_data_base + scratch_data_offset;
1096 
1097         int copy_done = 0;
1098 
1099         // The surrounding condition ensures that we always need at least one
1100         // iteration of the main copy loop. In the case of leading width
1101         // padding, we unroll this specially.
1102         if (leading_width_padding) {
1103           memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
1104           for (int i = 0; i < 4; ++i) {
1105             tmp_data[i] += -kSymmetricZeroPoint;
1106           }
1107           tmp_data[0] = -input_offset_difference;
1108           memcpy(scratch_data, tmp_data, 4);
1109           copy_done += 3;
1110         }
1111 
1112         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
1113           memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
1114                  4);
1115           for (int i = 0; i < 4; ++i) {
1116             tmp_data[i] += -kSymmetricZeroPoint;
1117           }
1118           // Perform as 4 int32 stores, because that is our alignment.
1119           memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1120         }
1121 
1122         // Total amount
1123         // = copy_size - copy_done + 4 - adjusted_residual_width
1124         // = width_overall_micro_repeats * 4 - start_width - copy_done.
1125         // Undone micro blocks
1126         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
1127         const int copy_remaining = copy_size - copy_done;
1128         // Conditional is (copy_remaining > 0 || trailing_width_padding).
1129         if (copy_done < copy_size_adjusted) {
1130           TFLITE_DCHECK_LT(copy_remaining, 4);
1131           // Employ overlapping-load strategy in order to load full register,
1132           // but use only part.
1133           memcpy(tmp_data,
1134                  input_block_data + input_block_offset + copy_done -
1135                      (4 - copy_remaining),
1136                  4);
1137           // Shift to select the part that we need.
1138           for (int i = 0; i < copy_remaining; ++i) {
1139             tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
1140           }
1141           for (int i = 0; i < 4; ++i) {
1142             tmp_data[i] += -kSymmetricZeroPoint;
1143           }
1144           // Apply padding to remainder, some unnecessary but costless in regs.
1145           for (int i = copy_remaining; i < 4; ++i) {
1146             tmp_data[i] = -input_offset_difference;
1147           }
1148           memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
1149           copy_done += 4;
1150         }
1151         memset(scratch_data + start_width + copy_done, -input_offset_difference,
1152                kWorkspaceExtension);
1153 
1154         scratch_data_offset += workspace_height_stride;
1155         input_block_offset += input_height_stride;
1156       }
1157     } else if (width_overall_micro_repeats == 2) {
1158       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1159         // Apply padding by quick fill of whole reg.
1160         for (int i = 0; i < 8; ++i) {
1161           tmp_data[i] = -input_offset;
1162         }
1163         for (int i = 0; i < copy_size; ++i) {
1164           // Apply shift-left insert, tmp_data as both operands.
1165           // The zero-index byte is left unchanged.
1166           for (int i = 7; i > 0; --i) {
1167             tmp_data[i] = tmp_data[i - 1];
1168           }
1169           tmp_data[1] =
1170               input_block_data[input_block_offset + (copy_size - 1 - i)];
1171         }
1172         if (!leading_width_padding) {
1173           // Remove leading padding, junking trailing byte, OK because max size
1174           // is less than 8.
1175           TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1176           for (int i = 0; i < 7; ++i) {
1177             tmp_data[i] = tmp_data[i + 1];
1178           }
1179         }
1180         for (int i = 0; i < 8; ++i) {
1181           tmp_data[i] += -kSymmetricZeroPoint;
1182         }
1183         memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
1184         memset(scratch_data_base + scratch_data_offset + 8,
1185                -input_offset_difference, kWorkspaceExtension);
1186 
1187         scratch_data_offset += workspace_height_stride;
1188         input_block_offset += input_height_stride;
1189       }
1190     } else {
1191       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
1192       // This path is basically the same as the preceding, 2-micro-block one,
1193       // but here we simply store fewer bytes.
1194       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1195         // Apply padding by quick fill of whole reg.
1196         for (int i = 0; i < 8; ++i) {
1197           tmp_data[i] = -input_offset;
1198         }
1199         for (int i = 0; i < copy_size; ++i) {
1200           // Apply shift-left insert, tmp_data as both operands.
1201           // The zero-index byte is left unchanged.
1202           for (int j = 7; j > 0; --j) {
1203             tmp_data[j] = tmp_data[j - 1];
1204           }
1205           tmp_data[1] =
1206               input_block_data[input_block_offset + (copy_size - 1 - i)];
1207         }
1208         if (!leading_width_padding) {
1209           // Remove the leading padding byte, junking the trailing byte; this is
1210           // OK because the maximum size is less than 8.
1211           TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
1212           for (int i = 0; i < 7; ++i) {
1213             tmp_data[i] = tmp_data[i + 1];
1214           }
1215         }
1216         for (int i = 0; i < 8; ++i) {
1217           tmp_data[i] += -kSymmetricZeroPoint;
1218         }
1219         memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
1220         memset(scratch_data_base + scratch_data_offset + 4,
1221                -input_offset_difference, kWorkspaceExtension);
1222 
1223         scratch_data_offset += workspace_height_stride;
1224         input_block_offset += input_height_stride;
1225       }
1226     }
1227 
1228     scratch_data_base += copy_block_height * workspace_height_stride;
1229 
1230     if (trailing_height_padding) {
1231       memset(scratch_data_base, -input_offset_difference,
1232              workspace_height_stride + kWorkspaceExtension);
1233       scratch_data_base += workspace_height_stride;
1234     }
1235 
1236     TFLITE_DCHECK_EQ(
1237         scratch_data_base,
1238         scratch_block_data + block_height * workspace_height_stride);
1239   }
1240 };
1241 // The preceding section is compiled only when the kUseUnwound3x3DotProduct
1242 // versions of the templated functions are selected.
1243 //
1244 // End of code section containing intermediate code transformation.
1245 
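// Editorial aside, not part of the TFLite kernels: a minimal constexpr sketch
// of the zero-point identity that the NEON code below relies on when it
// "effects subtraction of zero-point = 128 by XOR of sign bit". The helper
// name is illustrative only; the vectorized equivalent is the
// veorq_s8(..., sign_bit) calls in the packing routines.
constexpr int8 ExampleXorZeroPoint(uint8 x) {
  // XOR-ing the sign bit maps a uint8 value x to the int8 value x - 128.
  return static_cast<int8>(x ^ 0x80);
}
static_assert(ExampleXorZeroPoint(200) == 200 - 128,
              "XOR with 0x80 subtracts the symmetric zero point");
static_assert(ExampleXorZeroPoint(130) == 130 - 128,
              "XOR with 0x80 subtracts the symmetric zero point");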
1246 #ifdef USE_NEON
1247 template <QuantizationType quantization_type>
1248 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1249                       quantization_type,
1250                       DepthwiseConvDepthMultiplication::kNoMultiplication,
1251                       /*max_padding=*/0> {
1252   static inline void PackMacroBlockIntrinsics(
1253       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1254           input_block_data,
1255       int8* scratch_block_data,
1256       const DepthwiseConvDotProdParams* function_params) {
1257     TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
1258     TFLITE_DCHECK_EQ(function_params->padding_top, 0);
1259     TFLITE_DCHECK_EQ(function_params->padding_left, 0);
1260     TFLITE_DCHECK_EQ(function_params->padding_right, 0);
1261     const int workspace_height_stride =
1262         function_params->workspace_height_stride;
1263     const int width_overall_micro_repeats =
1264         function_params->input_width_overall_micro_repeats;
1265     const int input_width_micro_repeats =
1266         function_params->input_width_micro_repeats;
1267     const int depth_micro_repeats = function_params->depth_micro_repeats;
1268     const int block_height = function_params->inbound_block_height;
1269     const int residual_width = function_params->residual_width;
1270     const int input_height_stride = function_params->input_height_stride;
1271     const int input_depth = function_params->input_depth;
1272 
1273     TFLITE_DCHECK_GE(depth_micro_repeats, 0);
1274     constexpr uint8 kSignBit =
1275         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1276     const int micro_block_size = 4 * 8;
1277     const int depth_advance = width_overall_micro_repeats * micro_block_size;
1278     const int width_advance =
1279         micro_block_size *
1280         (1 - depth_micro_repeats * width_overall_micro_repeats);
1281     const int height_advance = workspace_height_stride -
1282                                width_overall_micro_repeats * micro_block_size;
1283     const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
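    // Editorial note, derived from the expressions above rather than any
    // separate spec: each 32-byte store below covers one 4x8 micro block, so
    // depth_advance jumps to the same width position in the next depth slice
    // of the workspace row, width_advance unwinds those depth jumps and steps
    // to the next width micro block, height_advance then completes the row out
    // to workspace_height_stride, and input_depth_skip moves the input pointer
    // from the 8 depths just read to the start of the next 4 width positions.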
1284 
1285     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1286     // code. Note the blocks of 4x4 are still interleaved down the depth.
1287     int8x16_t work_reg_a;
1288     int8x16_t work_reg_b;
1289 
1290     // Effect subtraction of zero-point = 128 by XOR of sign bit.
1291     const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
1292 
1293     // Work through one slice, by row, at a time.
1294     int8* scratch_data_0 = scratch_block_data;
1295 
1296     for (int k_height = 0; k_height < block_height; ++k_height) {
1297       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1298           input_data_0 = input_block_data;
1299       int8x16_t input_data_a;
1300       int8x16_t input_data_b;
1301       int8x16_t input_data_c;
1302       int8x16_t input_data_d;
1303 
1304       // Traverse the width one point at a time, but the depth in (micro) blocks
1305       // of size 8.
1306       //
1307       // The depth and width margins, which are filled with "zeros", may be
1308       // larger than is strictly needed to calculate output. This is because the
1309       // conv calculation is performed across complete micro blocks.
1310       for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
1311         int8x16_t work_reg_a_sp;
1312         int8x16_t work_reg_b_sp;
1313 
1314         int i_depth = 0;
1315 
1316         if (depth_micro_repeats >= 2) {
1317           i_depth += 2;
1318 
1319           input_data_a = util_vld1q_x8(input_data_0);
1320           input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1321           input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1322           input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1323           input_data_0 += 16;
1324 
1325           for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1326             work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1327             work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1328             vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1329             if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1330               work_reg_a = veorq_s8(work_reg_a, sign_bit);
1331               work_reg_b = veorq_s8(work_reg_b, sign_bit);
1332             }
1333 
1334             work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1335             work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1336             vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1337 
1338             input_data_a = util_vld1q_x8(input_data_0);
1339             input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1340             vst1q_s8(scratch_data_0, work_reg_a);
1341             vst1q_s8(scratch_data_0 + 16, work_reg_b);
1342 
1343             scratch_data_0 += depth_advance;
1344 
1345             if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1346               work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1347               work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1348             }
1349 
1350             input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1351             input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1352             vst1q_s8(scratch_data_0, work_reg_a_sp);
1353             vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1354 
1355             scratch_data_0 += depth_advance;
1356             input_data_0 += 16;
1357           }
1358 
1359           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1360           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1361           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1362           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1363             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1364             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1365           }
1366           vst1q_s8(scratch_data_0, work_reg_a);
1367           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1368 
1369           scratch_data_0 += depth_advance;
1370 
1371           work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1372           work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1373           vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1374           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1375             work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1376             work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1377           }
1378 
1379           vst1q_s8(scratch_data_0, work_reg_a_sp);
1380           vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1381 
1382           scratch_data_0 += depth_advance;
1383         }
1384         for (; i_depth < depth_micro_repeats; ++i_depth) {
1385           input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1386           input_data_b =
1387               vld1q_lane_s8x8(input_data_0 + 1 * input_depth, input_data_b, 0);
1388           input_data_c =
1389               vld1q_lane_s8x8(input_data_0 + 2 * input_depth, input_data_c, 0);
1390           input_data_d =
1391               vld1q_lane_s8x8(input_data_0 + 3 * input_depth, input_data_d, 0);
1392           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1393           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1394 
1395           input_data_0 += 8;
1396 
1397           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1398           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1399             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1400             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1401           }
1402 
1403           vst1q_s8(scratch_data_0, work_reg_a);
1404           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1405 
1406           scratch_data_0 += depth_advance;
1407         }
1408         scratch_data_0 += width_advance;
1409         input_data_0 += input_depth_skip;
1410       }
1411       if (width_overall_micro_repeats > input_width_micro_repeats) {
1412         TFLITE_DCHECK_EQ(width_overall_micro_repeats,
1413                          input_width_micro_repeats + 1);
1414         TFLITE_DCHECK_GT(residual_width, 0);
1415         TFLITE_DCHECK_LT(residual_width, 4);
1416         for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1417           input_data_c = vdupq_n_u8(kSignBit);
1418           input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1419           input_data_d = vdupq_n_u8(kSignBit);
1420           if (residual_width > 1) {
1421             input_data_b =
1422                 vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0);
1423             if (residual_width == 3) {
1424               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1425                                              input_data_c, 0);
1426             }
1427           }
1428           work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1429           work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1430 
1431           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1432             work_reg_a = veorq_s8(work_reg_a, sign_bit);
1433             work_reg_b = veorq_s8(work_reg_b, sign_bit);
1434           }
1435           vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1436 
1437           vst1q_s8(scratch_data_0, work_reg_a);
1438           vst1q_s8(scratch_data_0 + 16, work_reg_b);
1439 
1440           scratch_data_0 += depth_advance;
1441           input_data_0 += 8;
1442         }
1443         scratch_data_0 += width_advance;
1444         input_data_0 += input_depth_skip;
1445       }
1446 
1447       scratch_data_0 += height_advance;
1448       input_block_data += input_height_stride;
1449     }
1450     TFLITE_DCHECK_EQ(
1451         scratch_data_0,
1452         scratch_block_data + block_height * workspace_height_stride);
1453   }
1454 
1455   static inline void Run(
1456       int32 height_block_number, int32 width_block_number,
1457       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1458           input_block_data,
1459       int8* scratch_block_data,
1460       const DepthwiseConvDotProdParams* function_params) {
1461 #ifdef __aarch64__
1462     PreloadInputBlock(input_block_data, function_params);
1463 #endif
1464     PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
1465                              function_params);
1466   }
1467 };
1468 
1469 template <QuantizationType quantization_type>
1470 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1471                       quantization_type,
1472                       DepthwiseConvDepthMultiplication::kNoMultiplication,
1473                       /*max_padding=*/1> {
1474   static inline void PackMacroBlockIntrinsics(
1475       int32 height_block_number, int32 width_block_number,
1476       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1477           input_block_data,
1478       int8* scratch_block_data,
1479       const DepthwiseConvDotProdParams* function_params) {
1480     constexpr uint8 kSignBit =
1481         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1482 
1483     const int workspace_height_stride =
1484         function_params->workspace_height_stride;
1485     const int width_overall_micro_repeats =
1486         function_params->input_width_overall_micro_repeats;
1487     const int input_width_micro_repeats =
1488         function_params->input_width_micro_repeats;
1489     const int depth_micro_repeats = function_params->depth_micro_repeats;
1490     const int block_height = function_params->inbound_block_height;
1491     const int residual_width = function_params->residual_width;
1492     const int input_height_stride = function_params->input_height_stride;
1493     const int input_depth = function_params->input_depth;
1494 
1495     const int padding_left = function_params->padding_left;
1496     const int padding_right = function_params->padding_right;
1497     const int padding_top = function_params->padding_top;
1498     const int padding_bottom = function_params->padding_bottom;
1499 
1500     TFLITE_DCHECK_GT(depth_micro_repeats, 0);
1501     constexpr int kSymmetricZeroPoint =
1502         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1503 
1504     const int micro_block_size = 4 * 8;
1505     const int depth_advance = width_overall_micro_repeats * micro_block_size;
1506     const int width_advance =
1507         micro_block_size *
1508         (1 - depth_micro_repeats * width_overall_micro_repeats);
1509     const int height_advance = workspace_height_stride -
1510                                width_overall_micro_repeats * micro_block_size;
1511     const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
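    // The advance arithmetic here follows the same derivation as the note in
    // the max_padding=0 variant above.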
1512 
1513     const bool leading_width_padding =
1514         padding_left > 0 && width_block_number == 0;
1515     const bool trailing_width_padding =
1516         padding_right > 0 &&
1517         width_block_number == (function_params->width_macro_count - 1);
1518     const bool leading_height_padding =
1519         padding_top > 0 && height_block_number < 0;
1520     const bool trailing_height_padding =
1521         padding_bottom > 0 &&
1522         height_block_number == (function_params->height_macro_count - 1);
1523 
1524     const int32 input_offset = function_params->input_offset;
1525     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1526 
1527     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1528     // code. Note the blocks of 4x4 are still interleaved down the depth.
1529     int8x16_t work_reg_a;
1530     int8x16_t work_reg_b;
1531 
1532     // Effect subtraction of zero-point = 128 by XOR of sign bit.
1533     const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
1534 
1535     // Work through one slice, by row, at a time.
1536     int8* scratch_data_0 = scratch_block_data;
1537 
1538     int copy_block_height = block_height;
1539     if (leading_height_padding) {
1540       copy_block_height -= 1;
1541       memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1542       scratch_data_0 += workspace_height_stride;
1543       input_block_data += input_height_stride;
1544     }
1545     if (trailing_height_padding) {
1546       copy_block_height -= 1;
1547     }
1548 
1549     for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1550       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1551           input_data_0 = input_block_data;
1552       int8x16_t input_data_a;
1553       int8x16_t input_data_b;
1554       int8x16_t input_data_c;
1555       int8x16_t input_data_d;
1556 
1557       // Traverse the width one point at a time, but the depth in (micro) blocks
1558       // of size 8.
1559       //
1560       // The depth and width margins, which are filled with "zeros", may be
1561       // larger than is strictly needed to calculate output. This is because the
1562       // conv calculation is performed across complete micro blocks.
1563       for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
1564         // Figure out division of work (available input vs zero-ed).
1565         int adjusted_residual_width =
1566             j_width == (input_width_micro_repeats) ? residual_width : 4;
1567 
1568         if (trailing_width_padding &&
1569             j_width == (width_overall_micro_repeats - 1)) {
1570           adjusted_residual_width -= 1;
1571         }
1572         int start_width = 0;
1573         if (leading_width_padding && j_width == 0) {
1574           start_width = 1;
1575         }
1576         if (start_width == 0) {
1577           if (adjusted_residual_width == 4) {
1578             int8x16_t work_reg_a_sp;
1579             int8x16_t work_reg_b_sp;
1580 
1581             int i_depth = 0;
1582 
1583             if (depth_micro_repeats >= 2) {
1584               i_depth += 2;
1585 
1586               input_data_a = util_vld1q_x8(input_data_0);
1587               input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1588               input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1589               input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1590               input_data_0 += 16;
1591 
1592               for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1593                 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1594                 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1595                 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1596                 if (quantization_type ==
1597                     QuantizationType::kNonPerChannelUint8) {
1598                   work_reg_a = veorq_s8(work_reg_a, sign_bit);
1599                   work_reg_b = veorq_s8(work_reg_b, sign_bit);
1600                 }
1601 
1602                 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1603                 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1604                 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1605 
1606                 input_data_a = util_vld1q_x8(input_data_0);
1607                 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1608                 vst1q_s8(scratch_data_0, work_reg_a);
1609                 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1610 
1611                 scratch_data_0 += depth_advance;
1612 
1613                 if (quantization_type ==
1614                     QuantizationType::kNonPerChannelUint8) {
1615                   work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1616                   work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1617                 }
1618 
1619                 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1620                 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1621                 vst1q_s8(scratch_data_0, work_reg_a_sp);
1622                 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1623 
1624                 scratch_data_0 += depth_advance;
1625                 input_data_0 += 16;
1626               }
1627 
1628               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1629               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1630               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1631               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1632                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1633                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1634               }
1635               vst1q_s8(scratch_data_0, work_reg_a);
1636               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1637 
1638               scratch_data_0 += depth_advance;
1639 
1640               work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1641               work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1642               vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1643               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1644                 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1645                 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1646               }
1647 
1648               vst1q_s8(scratch_data_0, work_reg_a_sp);
1649               vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1650 
1651               scratch_data_0 += depth_advance;
1652             }
1653             for (; i_depth < depth_micro_repeats; ++i_depth) {
1654               input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1655               input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1656                                              input_data_b, 0);
1657               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1658                                              input_data_c, 0);
1659               input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1660                                              input_data_d, 0);
1661               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1662               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1663 
1664               input_data_0 += 8;
1665 
1666               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1667               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1668                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1669                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1670               }
1671 
1672               vst1q_s8(scratch_data_0, work_reg_a);
1673               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1674 
1675               scratch_data_0 += depth_advance;
1676             }
1677             scratch_data_0 += width_advance;
1678             input_data_0 += input_depth_skip;
1679           } else {
1680             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1681             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1682               input_data_a = vdupq_n_u8(-input_offset);
1683               input_data_b = vdupq_n_u8(-input_offset);
1684               input_data_c = vdupq_n_u8(-input_offset);
1685               input_data_d = vdupq_n_u8(-input_offset);
1686               if (adjusted_residual_width > 0) {
1687                 input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0);
1688                 if (adjusted_residual_width > 1) {
1689                   input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1690                                                  input_data_b, 0);
1691                   if (adjusted_residual_width == 3) {
1692                     input_data_c = vld1q_lane_s8x8(
1693                         input_data_0 + 2 * input_depth, input_data_c, 0);
1694                   }
1695                 }
1696               }
1697               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1698               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1699 
1700               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1701                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1702                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1703               }
1704               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1705 
1706               vst1q_s8(scratch_data_0, work_reg_a);
1707               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1708 
1709               scratch_data_0 += depth_advance;
1710               input_data_0 += 8;
1711             }
1712             scratch_data_0 += width_advance;
1713             input_data_0 += input_depth_skip;
1714           }
1715         } else {
1716           if (adjusted_residual_width == 4) {
1717             int8x16_t work_reg_a_sp;
1718             int8x16_t work_reg_b_sp;
1719 
1720             int i_depth = 0;
1721 
1722             if (depth_micro_repeats >= 2) {
1723               i_depth += 2;
1724 
1725               input_data_a = vdupq_n_u8(-input_offset);
1726               input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1727               input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1728               input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1729               input_data_0 += 16;
1730 
1731               for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
1732                 work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1733                 work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1734                 vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1735                 if (quantization_type ==
1736                     QuantizationType::kNonPerChannelUint8) {
1737                   work_reg_a = veorq_s8(work_reg_a, sign_bit);
1738                   work_reg_b = veorq_s8(work_reg_b, sign_bit);
1739                 }
1740 
1741                 work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1742                 work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1743                 vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1744 
1745                 input_data_a = vdupq_n_u8(-input_offset);
1746                 input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth);
1747                 vst1q_s8(scratch_data_0, work_reg_a);
1748                 vst1q_s8(scratch_data_0 + 16, work_reg_b);
1749 
1750                 scratch_data_0 += depth_advance;
1751 
1752                 if (quantization_type ==
1753                     QuantizationType::kNonPerChannelUint8) {
1754                   work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1755                   work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1756                 }
1757 
1758                 input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth);
1759                 input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth);
1760                 vst1q_s8(scratch_data_0, work_reg_a_sp);
1761                 vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1762 
1763                 scratch_data_0 += depth_advance;
1764                 input_data_0 += 16;
1765               }
1766 
1767               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1768               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1769               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1770               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1771                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1772                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1773               }
1774               vst1q_s8(scratch_data_0, work_reg_a);
1775               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1776 
1777               scratch_data_0 += depth_advance;
1778 
1779               work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b);
1780               work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d);
1781               vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp);
1782               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1783                 work_reg_a_sp = veorq_s8(work_reg_a_sp, sign_bit);
1784                 work_reg_b_sp = veorq_s8(work_reg_b_sp, sign_bit);
1785               }
1786 
1787               vst1q_s8(scratch_data_0, work_reg_a_sp);
1788               vst1q_s8(scratch_data_0 + 16, work_reg_b_sp);
1789 
1790               scratch_data_0 += depth_advance;
1791             }
1792             for (; i_depth < depth_micro_repeats; ++i_depth) {
1793               input_data_a = vdupq_n_u8(-input_offset);
1794               input_data_b = vld1q_lane_s8x8(input_data_0 + 1 * input_depth,
1795                                              input_data_b, 0);
1796               input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1797                                              input_data_c, 0);
1798               input_data_d = vld1q_lane_s8x8(input_data_0 + 3 * input_depth,
1799                                              input_data_d, 0);
1800               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1801               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1802 
1803               input_data_0 += 8;
1804 
1805               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1806               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1807                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1808                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1809               }
1810 
1811               vst1q_s8(scratch_data_0, work_reg_a);
1812               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1813 
1814               scratch_data_0 += depth_advance;
1815             }
1816             scratch_data_0 += width_advance;
1817             input_data_0 += input_depth_skip;
1818           } else {
1819             TFLITE_DCHECK_LT(adjusted_residual_width, 4);
1820 
1821             for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
1822               input_data_a = vdupq_n_u8(-input_offset);
1823               input_data_b = vdupq_n_u8(-input_offset);
1824               input_data_c = vdupq_n_u8(-input_offset);
1825               input_data_d = vdupq_n_u8(-input_offset);
1826               // Skip loading first column.
1827               if (adjusted_residual_width > 1) {
1828                 input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth,
1829                                                input_data_b, 0);
1830                 if (adjusted_residual_width == 3) {
1831                   input_data_c = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
1832                                                  input_data_c, 0);
1833                 }
1834               }
1835               work_reg_a = vzip1q_s8(input_data_a, input_data_b);
1836               work_reg_b = vzip1q_s8(input_data_c, input_data_d);
1837 
1838               if (quantization_type == QuantizationType::kNonPerChannelUint8) {
1839                 work_reg_a = veorq_s8(work_reg_a, sign_bit);
1840                 work_reg_b = veorq_s8(work_reg_b, sign_bit);
1841               }
1842               vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
1843 
1844               vst1q_s8(scratch_data_0, work_reg_a);
1845               vst1q_s8(scratch_data_0 + 16, work_reg_b);
1846 
1847               scratch_data_0 += depth_advance;
1848               input_data_0 += 8;
1849             }
1850             scratch_data_0 += width_advance;
1851             input_data_0 += input_depth_skip;
1852           }
1853         }
1854       }
1855       scratch_data_0 += height_advance;
1856       input_block_data += input_height_stride;
1857     }
1858 
1859     if (trailing_height_padding) {
1860       memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
1861       scratch_data_0 += workspace_height_stride;
1862     }
1863 
1864     TFLITE_DCHECK_EQ(
1865         scratch_data_0,
1866         scratch_block_data + block_height * workspace_height_stride);
1867   }
1868 
1869   static inline void Run(
1870       int32 height_block_number, int32 width_block_number,
1871       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1872           input_block_data,
1873       int8* scratch_block_data,
1874       const DepthwiseConvDotProdParams* function_params) {
1875 #ifdef __aarch64__
1876     PreloadInputBlock(input_block_data, function_params);
1877 #endif
1878 
1879     PackMacroBlockIntrinsics(height_block_number, width_block_number,
1880                              input_block_data, scratch_block_data,
1881                              function_params);
1882   }
1883 };
1884 
1885 template <QuantizationType quantization_type>
1886 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
1887                       quantization_type,
1888                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
1889                       /*max_padding=*/1> {
1890   static inline void PackMacroBlockIntrinsics(
1891       int32 height_block_number, int32 width_block_number,
1892       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
1893           input_block_data,
1894       int8* scratch_block_data,
1895       const DepthwiseConvDotProdParams* function_params) {
1896     const int workspace_height_stride =
1897         function_params->workspace_height_stride;
1898     const int width_overall_micro_repeats =
1899         function_params->input_width_overall_micro_repeats;
1900     const int input_width_micro_repeats =
1901         function_params->input_width_micro_repeats;
1902     const int block_height = function_params->inbound_block_height;
1903     const int residual_width = function_params->residual_width;
1904     const int input_height_stride = function_params->input_height_stride;
1905 
1906     const int padding_left = function_params->padding_left;
1907     const int padding_right = function_params->padding_right;
1908     const int padding_top = function_params->padding_top;
1909     const int padding_bottom = function_params->padding_bottom;
1910 
1911     constexpr int kSymmetricZeroPoint =
1912         QuantizationTypeImpl<quantization_type>::kIntSymmetricZeroPoint;
1913 
1914     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
1915 
1916     const bool leading_width_padding =
1917         padding_left > 0 && width_block_number == 0;
1918     const bool trailing_width_padding =
1919         padding_right > 0 &&
1920         width_block_number == (function_params->width_macro_count - 1);
1921     const bool leading_height_padding =
1922         padding_top > 0 && height_block_number < 0;
1923     const bool trailing_height_padding =
1924         padding_bottom > 0 &&
1925         height_block_number == (function_params->height_macro_count - 1);
1926 
1927     const int32 input_offset = function_params->input_offset;
1928     const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
1929 
1930     // Work through one slice, by row, at a time.
1931     int8* scratch_data_base = scratch_block_data;
1932 
1933     int copy_block_height = block_height;
1934     if (leading_height_padding) {
1935       copy_block_height -= 1;
1936       memset(scratch_data_base, -input_offset_difference,
1937              workspace_height_stride + kWorkspaceExtension);
1938       scratch_data_base += workspace_height_stride;
1939       input_block_data += input_height_stride;
1940     }
1941     if (trailing_height_padding) {
1942       copy_block_height -= 1;
1943     }
1944 
1945     int adjusted_residual_width =
1946         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
1947                                                                 : 4;
1948 
1949     if (trailing_width_padding) {
1950       adjusted_residual_width -= 1;
1951     }
1952     int start_width = 0;
1953     if (leading_width_padding) {
1954       start_width = 1;
1955       input_block_data += 1;
1956     }
1957 
1958     const int copy_size = (width_overall_micro_repeats - 1) * 4 +
1959                           adjusted_residual_width - start_width;
1960     // Adjusted so that later conditionals are simplified.
1961     const int copy_size_adjusted =
1962         trailing_width_padding ? copy_size + 1 : copy_size;
1963 
1964     TFLITE_DCHECK_LE(
1965         copy_size,
1966         input_height_stride - width_block_number * input_width_micro_repeats);
1967     // We may drop up to stride-1 of trailing input.
1968     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
1969 
1970     int scratch_data_offset = 0;
1971     int input_block_offset = 0;
1972 
1973     constexpr uint8 kSignBit =
1974         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
1975 
1976     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
1977     // code. Note the blocks of 4x4 are still interleaved down the depth.
1978     int8x16_t work_reg;
1979     int8x8_t half_work_reg;
1980     int8x8_t padding_mask;
1981 
1982     // Effect subtraction of zero-point = 128 by XOR of sign bit.
1983     const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
1984     const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
1985     padding_mask = vdup_n_s8(-1);
1986     half_work_reg = vdup_n_s8(0);
1987 
1988     if (copy_size >= 16) {
1989       const int copy_remaining = (copy_size + start_width) & 0x7;
1990       padding_mask = vreinterpret_s8_s64(vshl_s64(
1991           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
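      // Editorial note: shifting the all-ones mask left by 8 * copy_remaining
      // bits clears its low copy_remaining bytes, so the vbsl_s8 select in the
      // remainder handling below keeps the freshly loaded bytes in lanes
      // [0, copy_remaining) and substitutes the padding value in all other
      // lanes.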
1992 
1993       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
1994         // Work through one slice, by row, at a time.
1995         int8* scratch_data = scratch_data_base + scratch_data_offset;
1996 
1997         int copy_done = 0;
1998 
1999         // The surrounding condition ensures that we always need at least one
2000         // iteration of the main copy loop. In the case of leading width
2001         // padding, we unroll this specially.
2002         if (leading_width_padding) {
2003           work_reg = util_vld1q_x8(input_block_data + input_block_offset);
2004           work_reg = vextq_s8(padding_reg, work_reg, 15);
2005           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2006             work_reg = veorq_s8(work_reg, sign_bit);
2007           }
2008           vst1q_s8(scratch_data, work_reg);
2009           copy_done += 15;
2010         }
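        // Editorial note: vextq_s8(padding_reg, work_reg, 15) keeps the last
        // byte of padding_reg as lane 0 and the first 15 loaded input bytes as
        // lanes 1..15, so exactly one padding byte is prepended and only 15
        // input bytes are consumed (hence copy_done += 15).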
2011 
2012         // Main copy loop.
2013         for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2014           work_reg =
2015               util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2016           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2017             work_reg = veorq_s8(work_reg, sign_bit);
2018           }
2019           TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
2020           vst1q_s8(scratch_data + start_width + copy_done, work_reg);
2021         }
2022 
2023         if (copy_done + 8 <= copy_size) {
2024           half_work_reg =
2025               util_vld1_x8(input_block_data + input_block_offset + copy_done);
2026           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2027             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2028           }
2029           TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2030           vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2031           copy_done += 8;
2032         }
2033 
2034         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2035         // Total amount
2036         // = copy_size - copy_done + 4 - adjusted_residual_width
2037         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2038         // Undone micro blocks
2039         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2040 
2041         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2042         if (copy_done < copy_size_adjusted) {
2043           // Employ overlapping-load strategy in order to load full register,
2044           // but use only part.
2045           // This has the advantage of resulting in zeros after shifting.
2046           half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2047                                        copy_size - 8);
2048 
2049           half_work_reg = vreinterpret_s8_s64(
2050               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2051                        vdup_n_s64(-8 * (8 - copy_remaining))));
2052           half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2053                                   vget_low_s8(padding_reg), half_work_reg);
2054 
2055           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2056             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2057           }
2058           TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
2059           vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2060         }
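        // Editorial worked example with assumed values: if copy_remaining = 3,
        // the load above reads the final 8 input bytes of the row, the
        // negative count passed to vshl_s64 shifts them right by 40 bits so
        // the 3 wanted bytes land in lanes 0..2, and the vbsl/padding_mask
        // select replaces the remaining lanes with the padding value.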
2061 
2062         // Trailing guard.
2063         vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
2064         vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
2065 
2066         scratch_data_offset += workspace_height_stride;
2067         input_block_offset += input_height_stride;
2068       }
2069     } else if (copy_size >= 4) {
2070       const int copy_remaining = (copy_size + start_width) & 0x3;
2071       padding_mask = vreinterpret_s8_s64(vshl_s64(
2072           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2073 
2074       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2075         // Work through one slice, by row, at a time.
2076         int8* scratch_data = scratch_data_base + scratch_data_offset;
2077 
2078         int copy_done = 0;
2079 
2080         // The surrounding condition ensures that we always need at least one
2081         // iteration of the main copy loop. In the case of leading width
2082         // padding, we unroll this specially.
2083         if (leading_width_padding) {
2084           half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
2085                                         half_work_reg, 0);
2086           half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
2087           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2088             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2089           }
2090           vst1_lane_8x4(scratch_data, half_work_reg, 0);
2091           copy_done += 3;
2092         }
2093 
2094         // Main copy loop.
2095         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2096           half_work_reg =
2097               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2098                             half_work_reg, 0);
2099           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2100             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2101           }
2102           TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2103           vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
2104                         0);
2105         }
2106 
2107         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2108         // Total amount
2109         // = copy_size - copy_done + 4 - adjusted_residual_width
2110         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2111         // Undone micro blocks
2112         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2113 
2114         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2115         if (copy_done < copy_size_adjusted) {
2116           TFLITE_DCHECK_LT(copy_remaining, 4);
2117           // Employ overlapping-load strategy in order to load full register,
2118           // but use only part.
2119           // This has the advantage of resulting in zeros after shifting.
2120           half_work_reg = vld1_lane_8x4(
2121               input_block_data + input_block_offset + copy_size - 4,
2122               half_work_reg, 0);
2123 
2124           half_work_reg = vreinterpret_s8_s64(
2125               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2126                        vdup_n_s64(-8 * (4 - copy_remaining))));
2127           half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2128                                   vget_low_s8(padding_reg), half_work_reg);
2129 
2130           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2131             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2132           }
2133           TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
2134           vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
2135                         0);
2136           copy_done += 4;
2137         }
2138         // Trailing guard.
2139         vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
2140         vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
2141                       0);
2142         vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
2143                       0);
2144         vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
2145                       half_work_reg, 0);
2146 
2147         scratch_data_offset += workspace_height_stride;
2148         input_block_offset += input_height_stride;
2149       }
2150     } else if (width_overall_micro_repeats == 2) {
2151       // Special case of 1 + 3 + 1, padding + copy + padding.
2152       // This is rarely executed in practice.
2153       TFLITE_DCHECK_EQ(copy_size, 3);
2154       TFLITE_DCHECK_EQ(start_width, 1);
2155       TFLITE_DCHECK(leading_width_padding);
2156       TFLITE_DCHECK(trailing_width_padding);
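      // Editorial note: the vdup below fills every lane with the padding
      // value, and the three vld1_lane_s8 loads place the input bytes in lanes
      // 1..3, matching the padding + copy + padding layout described above.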
2157 
2158       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2159         half_work_reg = vdup_n_u8(-input_offset);
2160         half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
2161                                          input_block_data + input_block_offset),
2162                                      half_work_reg, 1);
2163         half_work_reg =
2164             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2165                                                        input_block_offset + 1),
2166                          half_work_reg, 2);
2167         half_work_reg =
2168             vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
2169                                                        input_block_offset + 2),
2170                          half_work_reg, 3);
2171 
2172         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2173           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2174         }
2175         TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
2176         vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
2177 
2178         // Trailing guard.
2179         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2180                       half_work_reg, 0);
2181         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2182                       half_work_reg, 0);
2183         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2184                       half_work_reg, 0);
2185         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2186                       half_work_reg, 0);
2187 
2188         scratch_data_offset += workspace_height_stride;
2189         input_block_offset += input_height_stride;
2190       }
2191     } else {
2192       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2193       const int copy_remaining = (copy_size + start_width) & 0x3;
2194       padding_mask = vreinterpret_s8_s64(vshl_s64(
2195           vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining)));
2196       if (leading_width_padding) {
2197         padding_mask = vset_lane_u8(255, padding_mask, 0);
2198       }
2199 
2200       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2201         for (int i = 0; i < copy_size; ++i) {
2202           half_work_reg = vreinterpret_s8_s64(
2203               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2204           half_work_reg = vld1_lane_s8(
2205               reinterpret_cast<const int8*>(
2206                   input_block_data + input_block_offset + copy_size - 1 - i),
2207               half_work_reg, 0);
2208         }
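        // Editorial note: the loop above walks the input backwards, shifting
        // the register left one byte per iteration and inserting the new byte
        // at lane 0, so after copy_size iterations the bytes occupy lanes
        // 0..copy_size-1 in their original order.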
2209         if (leading_width_padding) {
2210           half_work_reg = vreinterpret_s8_s64(
2211               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2212         }
2213         half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask),
2214                                 vget_low_s8(padding_reg), half_work_reg);
2215 
2216         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2217           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2218         }
2219         TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2220         vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2221                       0);
2222 
2223         // Trailing guard.
2224         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2225                       half_work_reg, 0);
2226         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2227                       half_work_reg, 0);
2228         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2229                       half_work_reg, 0);
2230         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2231                       half_work_reg, 0);
2232 
2233         scratch_data_offset += workspace_height_stride;
2234         input_block_offset += input_height_stride;
2235       }
2236     }
2237 
2238     scratch_data_base += copy_block_height * workspace_height_stride;
2239 
2240     if (trailing_height_padding) {
2241       memset(scratch_data_base, -input_offset_difference,
2242              workspace_height_stride + kWorkspaceExtension);
2243       scratch_data_base += workspace_height_stride;
2244     }
2245 
2246     TFLITE_DCHECK_EQ(
2247         scratch_data_base,
2248         scratch_block_data + block_height * workspace_height_stride);
2249   }
2250 
2251   static inline void Run(
2252       int32 height_block_number, int32 width_block_number,
2253       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2254           input_block_data,
2255       int8* scratch_block_data,
2256       const DepthwiseConvDotProdParams* function_params) {
2257 #ifdef __aarch64__
2258     PreloadInputBlock(input_block_data, function_params);
2259 #endif
2260 
2261     PackMacroBlockIntrinsics(height_block_number, width_block_number,
2262                              input_block_data, scratch_block_data,
2263                              function_params);
2264   }
2265 };
2266 
2267 template <QuantizationType quantization_type>
2268 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
2269                       quantization_type,
2270                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
2271                       /*max_padding=*/0> {
2272   static inline void PackMacroBlockIntrinsics(
2273       int32 height_block_number, int32 width_block_number,
2274       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2275           input_block_data,
2276       int8* scratch_block_data,
2277       const DepthwiseConvDotProdParams* function_params) {
2278     const int workspace_height_stride =
2279         function_params->workspace_height_stride;
2280     const int width_overall_micro_repeats =
2281         function_params->input_width_overall_micro_repeats;
2282     const int input_width_micro_repeats =
2283         function_params->input_width_micro_repeats;
2284     const int block_height = function_params->inbound_block_height;
2285     const int residual_width = function_params->residual_width;
2286     const int input_height_stride = function_params->input_height_stride;
2287 
2288     TFLITE_DCHECK_EQ(function_params->padding_left, 0);
2289     TFLITE_DCHECK_EQ(function_params->padding_right, 0);
2290     TFLITE_DCHECK_EQ(function_params->padding_top, 0);
2291     TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
2292 
2293     TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
2294 
2295     // Work through one slice, by row, at a time.
2296     int8* scratch_data_base = scratch_block_data;
2297 
2298     const int copy_block_height = block_height;
2299 
2300     int adjusted_residual_width =
2301         input_width_micro_repeats < width_overall_micro_repeats ? residual_width
2302                                                                 : 4;
2303 
2304     const int copy_size =
2305         (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
2306 
2307     TFLITE_DCHECK_LE(
2308         copy_size,
2309         input_height_stride - width_block_number * input_width_micro_repeats);
2310     // We may drop up to stride-1 of trailing input.
2311     TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
2312 
2313     int scratch_data_offset = 0;
2314     int input_block_offset = 0;
2315 
2316     constexpr uint8 kSignBit =
2317         QuantizationTypeImpl<quantization_type>::kUint8SignBit;
2318 
2319     // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
2320     // code. Note the blocks of 4x4 are still interleaved down the depth.
2321     int8x16_t work_reg;
2322     int8x8_t half_work_reg;
2323 
2324     // Effect subtraction of zero-point = 128 by XOR of sign bit.
2325     const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
2326     half_work_reg = vdup_n_s8(0);
2327 
2328     if (copy_size >= 16) {
2329       const int copy_remaining = copy_size & 0x7;
2330 
2331       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2332         // Work through one slice, by row, at a time.
2333         int8* scratch_data = scratch_data_base + scratch_data_offset;
2334 
2335         int copy_done = 0;
2336 
2337         // Main copy loop.
2338         for (; (copy_done + 16) <= copy_size; copy_done += 16) {
2339           work_reg =
2340               util_vld1q_x8(input_block_data + input_block_offset + copy_done);
2341           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2342             work_reg = veorq_s8(work_reg, sign_bit);
2343           }
2344           TFLITE_DCHECK_EQ(copy_done % 16, 0);
2345           vst1q_s8(scratch_data + copy_done, work_reg);
2346         }
2347 
2348         if (copy_done + 8 <= copy_size) {
2349           half_work_reg =
2350               util_vld1_x8(input_block_data + input_block_offset + copy_done);
2351           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2352             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2353           }
2354           TFLITE_DCHECK_EQ(copy_done % 8, 0);
2355           vst1_s8(scratch_data + copy_done, half_work_reg);
2356           copy_done += 8;
2357         }
2358 
2359         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2360         // Total amount
2361         // = copy_size - copy_done + 4 - adjusted_residual_width
2362         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2363         // Undone micro blocks
2364         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2365 
2366         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2367         if (copy_done < copy_size) {
2368           // Employ overlapping-load strategy in order to load full register,
2369           // but use only part.
2370           // This has the advantage of resulting in zeros after shifting.
2371           half_work_reg = util_vld1_x8(input_block_data + input_block_offset +
2372                                        copy_size - 8);
2373 
2374           half_work_reg = vreinterpret_s8_s64(
2375               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2376                        vdup_n_s64(-8 * (8 - copy_remaining))));
2377 
2378           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2379             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2380           }
2381           TFLITE_DCHECK_EQ(copy_done % 8, 0);
2382           vst1_s8(scratch_data + copy_done, half_work_reg);
2383           copy_done += 8;
2384         }
2385 
2386         // Trailing guard.
2387         vst1_s8(scratch_data + copy_done, half_work_reg);
2388         vst1_s8(scratch_data + copy_done + 8, half_work_reg);
2389 
2390         scratch_data_offset += workspace_height_stride;
2391         input_block_offset += input_height_stride;
2392       }
2393     } else if (copy_size >= 4) {
2394       const int copy_remaining = copy_size & 0x3;
2395 
2396       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2397         // Work through one slice, by row, at a time.
2398         int8* scratch_data = scratch_data_base + scratch_data_offset;
2399 
2400         int copy_done = 0;
2401 
2402         // Main copy loop.
2403         for (; (copy_done + 4) <= copy_size; copy_done += 4) {
2404           half_work_reg =
2405               vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
2406                             half_work_reg, 0);
2407           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2408             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2409           }
2410           TFLITE_DCHECK_EQ(copy_done % 4, 0);
2411           vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2412         }
2413 
2414         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
2415         // Total amount
2416         // = copy_size - copy_done + 4 - adjusted_residual_width
2417         // = width_overall_micro_repeats * 4 - start_width - copy_done.
2418         // Undone micro blocks
2419         // = width_overall_micro_repeats - (start_width + copy_done) / 4.
2420 
2421         // Conditional is (copy_remaining > 0 || trailing_width_padding).
2422         if (copy_done < copy_size) {
2423           TFLITE_DCHECK_LT(copy_remaining, 4);
2424           // Employ overlapping-load strategy in order to load full register,
2425           // but use only part.
2426           // This has the advantage of resulting in zeros after shifting.
2427           half_work_reg = vld1_lane_8x4(
2428               input_block_data + input_block_offset + copy_size - 4,
2429               half_work_reg, 0);
2430 
2431           half_work_reg = vreinterpret_s8_s64(
2432               vshl_s64(vreinterpret_s64_s8(half_work_reg),
2433                        vdup_n_s64(-8 * (4 - copy_remaining))));
2434 
2435           if (quantization_type == QuantizationType::kNonPerChannelUint8) {
2436             half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
2437           }
2438           TFLITE_DCHECK_EQ(copy_done % 4, 0);
2439           vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2440           copy_done += 4;
2441         }
2442         // Trailing guard.
2443         vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
2444         vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
2445         vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
2446         vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
2447 
2448         scratch_data_offset += workspace_height_stride;
2449         input_block_offset += input_height_stride;
2450       }
2451     } else {
2452       TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
2453 
2454       for (int k_height = 0; k_height < copy_block_height; ++k_height) {
2455         for (int i = 0; i < copy_size; ++i) {
2456           half_work_reg = vreinterpret_s8_s64(
2457               vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8));
2458           half_work_reg = vld1_lane_s8(
2459               reinterpret_cast<const int8*>(
2460                   input_block_data + input_block_offset + copy_size - 1 - i),
2461               half_work_reg, 0);
2462         }
2463 
2464         if (quantization_type == QuantizationType::kNonPerChannelUint8) {
          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
        }
2465         TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
2466         vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
2467                       0);
2468 
2469         // Trailing guard.
2470         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
2471                       half_work_reg, 0);
2472         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
2473                       half_work_reg, 0);
2474         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
2475                       half_work_reg, 0);
2476         vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
2477                       half_work_reg, 0);
2478 
2479         scratch_data_offset += workspace_height_stride;
2480         input_block_offset += input_height_stride;
2481       }
2482     }
2483 
2484     scratch_data_base += copy_block_height * workspace_height_stride;
2485 
2486     TFLITE_DCHECK_EQ(
2487         scratch_data_base,
2488         scratch_block_data + block_height * workspace_height_stride);
2489   }
2490 
2491   static inline void Run(
2492       int32 height_block_number, int32 width_block_number,
2493       const typename QuantizationTypeImpl<quantization_type>::ExternalType*
2494           input_block_data,
2495       int8* scratch_block_data,
2496       const DepthwiseConvDotProdParams* function_params) {
2497 #ifdef __aarch64__
2498     PreloadInputBlock(input_block_data, function_params);
2499 #endif
2500 
2501     PackMacroBlockIntrinsics(height_block_number, width_block_number,
2502                              input_block_data, scratch_block_data,
2503                              function_params);
2504   }
2505 };
2506 
2507 #endif  // ARM NEON
2508 
2509 // Apply filter to macro block of input data and store results.
2510 //
2511 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2512 template <int32 stride, QuantizationType quantization_type>
2513 struct KernelMacroBlock<
2514     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2515     DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2516   // Construct a width-shifted combination of two input sub-blocks, effectively
2517   // concatenating them.
2518   //
2519   // The filter is applied using sub-blocks. These are in the needed form for
2520   // the first (width) offset. For subsequent offsets, the filter is applied to
2521   // shifted and combined data. The concatenation and shifting herein is fairly
2522   // straightforward, but in the optimized code is an area of creativity in
2523   // design because NEON instructions do not directly support the required
2524   // between-register permutation.
2525   //
2526   // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2527   // move along the width for each output point calculation, data is shifted, in
2528   // essence between two such blocks.
2529   //
2530   // selected_data has format height 3, depth 4, width 4.
2531   //
2532   // When the micro block is trailing (the last across the macro-block width),
2533   // it would be illegal to load the right (next) block, and the no_right_block
2534   // indicates this scenario.
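  //
  // For example (hypothetical values), with offset == 1 each selected row and
  // depth slot becomes {left[1], left[2], left[3], right[0]}: the 4-wide
  // window slides one input position to the right across the two sub-blocks.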
2535   static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
2536                                                int workspace_height_stride,
2537                                                int width_micro_stride,
2538                                                bool no_right_block,
2539                                                const int8* input_block,
2540                                                int8 selected_data[3][4][4]) {
2541     TFLITE_DCHECK_GE(offset, 0);
2542     TFLITE_DCHECK_LT(offset, 4);
2543 
2544     // The input banks have same format as selected_data.
2545     int8 left_bank[3][4][4];
2546     int8 right_bank[3][4][4];
2547 
2548     // Work through one slice, by row, at a time.
2549     for (int k_height = 0; k_height < 3; ++k_height) {
2550       // Simulate demangling of mangled storage arrangement.
2551       const int8* left_input_block =
2552           &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
2553       memcpy(left_bank[k_height][0], left_input_block, 16);
2554       if (no_right_block) {
2555         memset(right_bank[k_height][0], 0, 16);
2556       } else {
2557         const int8* right_input_block =
2558             &input_block[k_height * workspace_height_stride +
2559                          sub_block * 2 * 8 + width_micro_stride];
2560         memcpy(right_bank[k_height][0], right_input_block, 16);
2561       }
2562       for (int depth_index = 0; depth_index < 4; ++depth_index) {
2563         memcpy(selected_data[k_height][depth_index],
2564                &left_bank[k_height][depth_index][offset], 4 - offset);
2565         memcpy(&selected_data[k_height][depth_index][4 - offset],
2566                right_bank[k_height][depth_index], offset);
2567       }
2568     }
2569   }
2570 
2571   // Straight implementation of 3x3 filter within sub-micro block.
2572   static inline void Calculate3x3FilterOutput(
2573       const DepthwiseConvDotProdParams& params, int sub_block,
2574       const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
2575       const int32* bias_data, uint8 output_values[4]) {
2576     const int32 output_activation_min = params.quantized_activation_min;
2577     const int32 output_activation_max = params.quantized_activation_max;
2578     const int32 output_multiplier = params.output_multiplier;
2579     const int32 output_shift = params.output_shift;
2580     const int32 output_offset = params.output_offset;
2581     for (int d = 0; d < 4; ++d) {
2582       int32 acc = 0;
2583       for (int y = 0; y < 3; ++y) {
2584         for (int x = 0; x < 4; ++x) {
2585           int32 input_val = selected_data[y][d][x];
2586           int32 filter_val = filter_bank[y][sub_block][d][x];
2587           acc += filter_val * input_val;
2588         }
2589       }
2590       acc += bias_data[d];
2591       acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2592           DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2593                                                 output_shift);
2594       acc += output_offset;
2595       acc = std::max(acc, output_activation_min);
2596       acc = std::min(acc, output_activation_max);
2597       output_values[d] = static_cast<uint8>(acc);
2598     }
2599   }
2600 
2601   static inline void Run(const int8* scratch_block_data,
2602                          const int8* filter_workspace, const int32* bias_data,
2603                          uint8* output_block_data,
2604                          const DepthwiseConvDotProdParams* function_params) {
2605     const int workspace_height_stride =
2606         function_params->workspace_height_stride;
2607     const int input_width_overall_micro_repeats =
2608         function_params->input_width_overall_micro_repeats;
2609     const int output_width_micro_repeats =
2610         function_params->output_width_micro_repeats;
2611     const int depth_micro_repeats = function_params->depth_micro_repeats;
2612     const int depth = function_params->input_depth;
2613     const int stride_val = function_params->stride;
2614     const int four_over_stride = function_params->four_over_stride;
2615 
2616     const int output_width_overall_micro_repeats =
2617         function_params->output_width_overall_micro_repeats;
2618     const int block_height = function_params->outbound_block_height;
2619     const int residual_width = function_params->output_residual_width;
2620     const int output_height_stride = function_params->output_height_stride;
2621     constexpr int bias_increment = 4;
2622     TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2623 
2624     TFLITE_DCHECK(depth_micro_repeats > 0);
2625     const int width_micro_stride = 4 * 8;
2626     const int depth_micro_stride =
2627         width_micro_stride * input_width_overall_micro_repeats;
2628 
2629     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2630 
2631     // Simulate NEON-register transposition of subset of filter.
2632     int8 filter_bank[3][2][4][4];  // Height 3, sub-block, depth 4, width 4.
2633     // Simulate NEON-register input data concatenation + sub-selection.
2634     int8 sub_selected_input_data[3][4][4];  // Height 3, depth 4, width 4.
2635     uint8 output_values[4];                 // Depth 4.
2636 
2637     // The outer 3 loops go through all the micro blocks in a macro block, and
2638     // separately treat the two sub-blocks within each micro block.
2639     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2640       memcpy(filter_bank[0][0][0],
2641              filter_workspace + j_depth * shuffled_filter_increment,
2642              shuffled_filter_increment);
2643 
2644       for (int s = 0; s < 2; ++s) {
2645         for (int k_height = 0; k_height < block_height; ++k_height) {
2646           const int8* scratch_data =
2647               scratch_block_data +
2648               workspace_height_stride * k_height * stride_val +
2649               depth_micro_stride * j_depth;
2650           uint8* output_data =
2651               output_block_data + output_height_stride * k_height + 8 * j_depth;
2652 
2653           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2654                ++i_width) {
2655             const int output_width = i_width == output_width_micro_repeats
2656                                          ? residual_width
2657                                          : four_over_stride;
2658             const bool no_right_block = (output_width - 1) * stride_val < 2;
2659             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2660             const int8* input_data =
2661                 scratch_data + width_micro_stride * i_width;
2662             // Iterate over input width shifts within sub-micro blocks.
2663             for (int x = 0; x < output_width; ++x) {
2664               ConcatenateInputSubBlocks(x * stride_val, s,
2665                                         workspace_height_stride,
2666                                         width_micro_stride, no_right_block,
2667                                         input_data, sub_selected_input_data);
2668               Calculate3x3FilterOutput(
2669                   *function_params, s, sub_selected_input_data, filter_bank,
2670                   bias_data + (2 * j_depth + s) * bias_increment,
2671                   output_values);
2672               for (int d = 0; d < 4; ++d) {
2673                 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2674                             d] = output_values[d];
2675               }
2676             }
2677           }
2678         }
2679       }
2680     }
2681   }
2682 };
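
// Illustration only: a minimal scalar sketch, not referenced by the kernels in
// this file, of the per-depth-slot window selection that
// ConcatenateInputSubBlocks above performs with memcpy. The helper name is
// hypothetical.
inline void ExampleSelectShiftedWindow(const int8 left[4], const int8 right[4],
                                       int offset, int8 selected[4]) {
  TFLITE_DCHECK_GE(offset, 0);
  TFLITE_DCHECK_LT(offset, 4);
  for (int x = 0; x < 4; ++x) {
    // Tail of the left sub-block first, then the head of the right one.
    selected[x] = (x + offset < 4) ? left[x + offset] : right[x + offset - 4];
  }
}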
2683 
2684 // Apply filter to macro block of input data and store results.
2685 //
2686 // Parameters for repeats and residual sizes are in terms of outputs.
2687 //
2688 // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
2689 template <int32 stride, QuantizationType quantization_type>
2690 struct KernelMacroBlock<
2691     DepthwiseConvImplementation::kUseCModel3x3DotProduct, quantization_type,
2692     DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
2693   // Construct a width-shifted combination of two input sub-blocks, effectively
2694   // concatenating them.
2695   //
2696   // The filter is applied using sub-blocks. These are in the needed form for
2697   // the first (width) offset. For subsequent offsets, the filter is applied to
2698   // shifted and combined data. The concatenation and shifting herein is fairly
2699   // straightforward, but in the optimized code is an area of creativity in
2700   // design because NEON instructions do not directly support the required
2701   // between-register permutation.
2702   //
2703   // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
2704   // move along the width for each output point calculation, data is shifted, in
2705   // essence between two such blocks.
2706   //
2707   // selected_data has format height 3, width 4.
2708   //
2709   // When the micro block is trailing (the last across the macro-block width),
2710   // it would be illegal to load the right (next) block, and the no_right_block
2711   // indicates this scenario.
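  //
  // For example (hypothetical values), with offset == 1 and no right block,
  // only left positions 1..3 are copied, into selected_data[k_height][0..2].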
2712   static inline void ConcatenateInputSubBlocks(int offset,
2713                                                int workspace_height_stride,
2714                                                bool no_right_block,
2715                                                const int8* input_block,
2716                                                int8 selected_data[3][4]) {
2717     TFLITE_DCHECK_GE(offset, 0);
2718     TFLITE_DCHECK_LT(offset, 4);
2719     if (no_right_block) {
2720       for (int k_height = 0; k_height < 3; ++k_height) {
2721         memcpy(selected_data[k_height],
2722                &input_block[k_height * workspace_height_stride + offset],
2723                4 - offset);
2724       }
2725     } else {
2726       for (int k_height = 0; k_height < 3; ++k_height) {
2727         memcpy(selected_data[k_height],
2728                &input_block[k_height * workspace_height_stride + offset], 4);
2729       }
2730     }
2731   }
2732 
2733   // Straight implementation of 3x3 filter within sub-micro block.
2734   static inline void Calculate3x3FilterOutput(
2735       const DepthwiseConvDotProdParams& function_params, int sub_block,
2736       const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
2737       const int32* bias_data, uint8 output_values[4]) {
2738     const int32 output_activation_min =
2739         function_params.quantized_activation_min;
2740     const int32 output_activation_max =
2741         function_params.quantized_activation_max;
2742     const int32 output_multiplier = function_params.output_multiplier;
2743     const int32 output_shift = function_params.output_shift;
2744     const int32 output_offset = function_params.output_offset;
2745     for (int d = 0; d < 4; ++d) {
2746       int32 acc = 0;
2747       for (int y = 0; y < 3; ++y) {
2748         for (int x = 0; x < 4; ++x) {
2749           int32 input_val = selected_data[y][x];
2750           int32 filter_val = filter_bank[y][sub_block][d][x];
2751           acc += filter_val * input_val;
2752         }
2753       }
2754       acc += bias_data[d];
2755       acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2756           DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
2757                                                 output_shift);
2758       acc += output_offset;
2759       acc = std::max(acc, output_activation_min);
2760       acc = std::min(acc, output_activation_max);
2761       output_values[d] = static_cast<uint8>(acc);
2762     }
2763   }
2764 
2765   static inline void Run(const int8* scratch_block_data,
2766                          const int8* filter_workspace, const int32* bias_data,
2767                          uint8* output_block_data,
2768                          const DepthwiseConvDotProdParams* function_params) {
2769     const int workspace_height_stride =
2770         function_params->workspace_height_stride;
2771     const int output_width_micro_repeats =
2772         function_params->output_width_micro_repeats;
2773     const int depth_micro_repeats = function_params->depth_micro_repeats;
2774     const int depth = function_params->output_depth;
2775     const int stride_val = function_params->stride;
2776     const int four_over_stride = function_params->four_over_stride;
2777 
2778     const int workspace_width_micro_repeats =
2779         function_params->workspace_width_micro_repeats;
2780     const int output_width_overall_micro_repeats =
2781         function_params->output_width_overall_micro_repeats;
2782     const int block_height = function_params->outbound_block_height;
2783     const int residual_width = function_params->output_residual_width;
2784     const int output_height_stride = function_params->output_height_stride;
2785     constexpr int bias_increment = 4;
2786     TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
2787 
2788     TFLITE_DCHECK(depth_micro_repeats > 0);
2789 
2790     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2791 
2792     // Simulate NEON-register transposition of subset of filter.
2793     int8 filter_bank[3][2][4][4];  // Height 3, sub-block, depth 4, width 4.
2794     // Simulate NEON-register input data concatenation + sub-selection.
2795     int8 sub_selected_input_data[3][4];  // Height 3, width 4.
2796     uint8 output_values[4];              // Depth 4.
2797 
2798     // The outer 3 loops go through all the micro blocks in a macro block, and
2799     // separately treat the two sub-blocks within each micro block.
2800     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2801       memcpy(filter_bank[0][0][0],
2802              filter_workspace + j_depth * shuffled_filter_increment,
2803              shuffled_filter_increment);
2804 
2805       for (int s = 0; s < 2; ++s) {
2806         for (int k_height = 0; k_height < block_height; ++k_height) {
2807           const int8* scratch_data =
2808               scratch_block_data +
2809               workspace_height_stride * k_height * stride_val;
2810           uint8* output_data =
2811               output_block_data + output_height_stride * k_height + 8 * j_depth;
2812 
2813           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2814                ++i_width) {
2815             const int output_width = i_width == output_width_micro_repeats
2816                                          ? residual_width
2817                                          : four_over_stride;
2818             const bool no_right_block = i_width == output_width_micro_repeats &&
2819                                         output_width_overall_micro_repeats ==
2820                                             workspace_width_micro_repeats;
2821             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2822             const int8* input_data = scratch_data + 4 * i_width;
2823             // Iterate over input width shifts within 4x4 blocks.
2824             for (int x = 0; x < output_width; ++x) {
2825               ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
2826                                         no_right_block, input_data,
2827                                         sub_selected_input_data);
2828               Calculate3x3FilterOutput(
2829                   *function_params, s, sub_selected_input_data, filter_bank,
2830                   bias_data + (2 * j_depth + s) * bias_increment,
2831                   output_values);
2832               for (int d = 0; d < 4; ++d) {
2833                 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2834                             d] = output_values[d];
2835               }
2836             }
2837           }
2838         }
2839       }
2840     }
2841   }
2842 };
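
// Illustration only: a hypothetical helper, not referenced by the kernels in
// this file, restating the flat offset that the Run() methods above add to the
// per-row, per-depth-block output_data pointer for micro-block column i_width,
// within-block position x, sub-block s and depth-within-sub-block d.
inline int ExampleOutputOffsetSketch(int depth, int four_over_stride,
                                     int i_width, int x, int s, int d) {
  return depth * (four_over_stride * i_width + x) + 4 * s + d;
}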
2843 
2844 // Beginning of code section containing intermediate code transformation.
2845 //
2846 // This section is only compiled when kUseUnwound3x3DotProduct versions of
2847 // templated functions are selected.
2848 template <int32 stride, QuantizationType quantization_type>
2849 struct KernelMacroBlock<
2850     DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
2851     DepthwiseConvDepthMultiplication::kNoMultiplication, stride> {
2852   static inline void Run(const int8* scratch_block_data,
2853                          const int8* filter_workspace, const int32* bias_data,
2854                          uint8* output_block_data,
2855                          const DepthwiseConvDotProdParams* function_params) {
2856     const int workspace_height_stride =
2857         function_params->workspace_height_stride;
2858     const int input_width_overall_micro_repeats =
2859         function_params->input_width_overall_micro_repeats;
2860     const int output_width_micro_repeats =
2861         function_params->output_width_micro_repeats;
2862     const int depth_micro_repeats = function_params->depth_micro_repeats;
2863     const int depth = function_params->input_depth;
2864     const int stride_val = function_params->stride;
2865     const int four_over_stride = function_params->four_over_stride;
2866 
2867     const int output_width_overall_micro_repeats =
2868         function_params->output_width_overall_micro_repeats;
2869     const int block_height = function_params->outbound_block_height;
2870     const int residual_width = function_params->output_residual_width;
2871     const int output_height_stride = function_params->output_height_stride;
2872     const int bias_increment = function_params->bias_increment;
2873 
2874     TFLITE_DCHECK(depth_micro_repeats > 0);
2875     const int width_micro_stride = 4 * 8;
2876     const int depth_micro_stride =
2877         width_micro_stride * input_width_overall_micro_repeats;
2878 
2879     const int32 output_activation_min =
2880         function_params->quantized_activation_min;
2881     const int32 output_activation_max =
2882         function_params->quantized_activation_max;
2883     const int32 output_multiplier = function_params->output_multiplier;
2884     const int32 output_shift = function_params->output_shift;
2885     const int32 output_offset = function_params->output_offset;
2886 
2887     // Simulate NEON-register transposition of subset of filter.
2888     int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
2889     int8 filter_bank_a_1[4][4];
2890     int8 filter_bank_a_2[4][4];
2891     int8 filter_bank_b_0[4][4];
2892     int8 filter_bank_b_1[4][4];
2893     int8 filter_bank_b_2[4][4];
2894     // Simulate NEON-register input data concatenation + sub-selection.
2895     uint8 output_values[4];  // Depth 4.
2896     // Each left/right bank below has format depth 4, width 4; the _0/_1/_2
2897     // suffix indexes the three filter rows (height).
2898     int8 left_bank_0[4][4];
2899     int8 left_bank_1[4][4];
2900     int8 left_bank_2[4][4];
2901     int8 right_bank_0[4][4];
2902     int8 right_bank_1[4][4];
2903     int8 right_bank_2[4][4];
2904     memset(right_bank_0[0], 0, 16);
2905     memset(right_bank_1[0], 0, 16);
2906     memset(right_bank_2[0], 0, 16);
2907 
2908     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
2909 
2910     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
2911       const int8* filter_block =
2912           filter_workspace + shuffled_filter_increment * j_depth;
2913 
2914       memcpy(filter_bank_a_0, filter_block, 16);
2915       memcpy(filter_bank_b_0, filter_block + 16, 16);
2916       memcpy(filter_bank_a_1, filter_block + 32, 16);
2917       memcpy(filter_bank_b_1, filter_block + 48, 16);
2918       memcpy(filter_bank_a_2, filter_block + 64, 16);
2919       memcpy(filter_bank_b_2, filter_block + 80, 16);
2920 
2921       for (int s = 0; s < 2; ++s) {
2922         // Work through one slice, by row, at a time.
2923         for (int k_height = 0; k_height < block_height; ++k_height) {
2924           const int8* scratch_data =
2925               scratch_block_data +
2926               workspace_height_stride * k_height * stride_val +
2927               depth_micro_stride * j_depth;
2928           uint8* output_data =
2929               output_block_data + output_height_stride * k_height + 8 * j_depth;
2930           const int8* input_data_0 = scratch_data + s * 2 * 8;
2931 
2932           // Load first sub-micro block of data into operational banks.
2933           memcpy(left_bank_0[0], input_data_0, 16);
2934           memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
2935           memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
2936                  16);
2937 
2938           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
2939                ++i_width) {
2940             const int output_width = i_width == output_width_micro_repeats
2941                                          ? residual_width
2942                                          : four_over_stride;
2943             TFLITE_DCHECK_LE(output_width * stride_val, 4);
2944             const int8* input_data =
2945                 input_data_0 + width_micro_stride * i_width;
2946             const bool no_right_block = (output_width - 1) * stride_val < 2;
2947 
2948             // Load next sub-micro block of data.
2949             if (!no_right_block) {
2950               memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
2951               memcpy(right_bank_1[0],
2952                      input_data + workspace_height_stride + width_micro_stride,
2953                      16);
2954               memcpy(
2955                   right_bank_2[0],
2956                   input_data + 2 * workspace_height_stride + width_micro_stride,
2957                   16);
2958             }
2959 
2960             // Iterate over input width shifts within 4x4 blocks.
2961             for (int x = 0; x < output_width; ++x) {
2962               // Operate on depth of 4 in batches.
2963               for (int d = 0; d < 4; ++d) {
2964                 int32 acc = 0;
2965                 for (int x = 0; x < 4; ++x) {
2966                   int32 input_val = left_bank_0[d][x];
2967                   int32 filter_val = filter_bank_a_0[d][x];
2968                   acc += filter_val * input_val;
2969                 }
2970                 for (int x = 0; x < 4; ++x) {
2971                   int32 input_val = left_bank_1[d][x];
2972                   int32 filter_val = filter_bank_a_1[d][x];
2973                   acc += filter_val * input_val;
2974                 }
2975                 for (int x = 0; x < 4; ++x) {
2976                   int32 input_val = left_bank_2[d][x];
2977                   int32 filter_val = filter_bank_a_2[d][x];
2978                   acc += filter_val * input_val;
2979                 }
2980                 acc += bias_data[d];
2981                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
2982                     DepthwiseConvOutputRounding::kUpward>(
2983                     acc, output_multiplier, output_shift);
2984                 acc += output_offset;
2985                 acc = std::max(acc, output_activation_min);
2986                 acc = std::min(acc, output_activation_max);
2987                 output_values[d] = static_cast<uint8>(acc);
2988               }
2989 
2990               for (int d = 0; d < 4; ++d) {
2991                 output_data[depth * (four_over_stride * i_width + x) + 4 * s +
2992                             d] = output_values[d];
2993               }
2994 
2995               // Simulate shifting instructions.
2996               if (stride_val == 1) {
2997                 for (int depth_index = 0; depth_index < 4; ++depth_index) {
2998                   for (int z = 0; z < 3; ++z) {
2999                     left_bank_0[depth_index][z] =
3000                         left_bank_0[depth_index][z + 1];
3001                     left_bank_1[depth_index][z] =
3002                         left_bank_1[depth_index][z + 1];
3003                     left_bank_2[depth_index][z] =
3004                         left_bank_2[depth_index][z + 1];
3005                   }
3006                   left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
3007                   left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
3008                   left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
3009                   for (int z = 0; z < 3; ++z) {
3010                     right_bank_0[depth_index][z] =
3011                         right_bank_0[depth_index][z + 1];
3012                     right_bank_1[depth_index][z] =
3013                         right_bank_1[depth_index][z + 1];
3014                     right_bank_2[depth_index][z] =
3015                         right_bank_2[depth_index][z + 1];
3016                   }
3017                 }
3018               } else {
3019                 for (int depth_index = 0; depth_index < 4; ++depth_index) {
3020                   for (int z = 0; z < 2; ++z) {
3021                     left_bank_0[depth_index][z] =
3022                         left_bank_0[depth_index][z + 2];
3023                     left_bank_1[depth_index][z] =
3024                         left_bank_1[depth_index][z + 2];
3025                     left_bank_2[depth_index][z] =
3026                         left_bank_2[depth_index][z + 2];
3027                   }
3028                   left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
3029                   left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
3030                   left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
3031                   left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
3032                   left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
3033                   left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
3034                   for (int z = 0; z < 2; ++z) {
3035                     right_bank_0[depth_index][z] =
3036                         right_bank_0[depth_index][z + 2];
3037                     right_bank_1[depth_index][z] =
3038                         right_bank_1[depth_index][z + 2];
3039                     right_bank_2[depth_index][z] =
3040                         right_bank_2[depth_index][z + 2];
3041                   }
3042                 }
3043               }
3044             }
3045           }
3046         }
3047         bias_data += bias_increment;
3048 
3049         // Move filter for second sub-block into operational filter.
3050         for (int z = 0; z < 4; ++z) {
3051           for (int x = 0; x < 4; ++x) {
3052             filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
3053             filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
3054             filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
3055           }
3056         }
3057       }
3058     }
3059   }
3060 };
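
// Illustration only: a hypothetical scalar helper, not referenced by the
// kernels in this file, showing the bank advance that the "Simulate shifting
// instructions" block above performs per depth slot: the 4-wide left window
// slides by `stride`, pulling new bytes from the right bank, which then slides
// by the same amount.
inline void ExampleAdvanceBanksByStride(int8 left[4], int8 right[4],
                                        int stride) {
  for (int z = 0; z < 4; ++z) {
    left[z] = (z + stride < 4) ? left[z + stride] : right[z + stride - 4];
  }
  for (int z = 0; z + stride < 4; ++z) {
    right[z] = right[z + stride];
  }
}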
3061 
3062 template <int32 stride, QuantizationType quantization_type>
3063 struct KernelMacroBlock<
3064     DepthwiseConvImplementation::kUseUnwound3x3DotProduct, quantization_type,
3065     DepthwiseConvDepthMultiplication::kUnitInputDepth, stride> {
3066   static inline void Run(const int8* scratch_block_data,
3067                          const int8* filter_workspace, const int32* bias_data,
3068                          uint8* output_block_data,
3069                          const DepthwiseConvDotProdParams* function_params) {
3070     const int workspace_height_stride =
3071         function_params->workspace_height_stride;
3072     const int output_width_micro_repeats =
3073         function_params->output_width_micro_repeats;
3074     const int depth_micro_repeats = function_params->depth_micro_repeats;
3075     const int output_depth = function_params->output_depth;
3076     const int stride_val = function_params->stride;
3077     const int four_over_stride = function_params->four_over_stride;
3078 
3079     const int output_width_overall_micro_repeats =
3080         function_params->output_width_overall_micro_repeats;
3081     const int block_height = function_params->outbound_block_height;
3082     const int residual_width = function_params->output_residual_width;
3083     const int output_height_stride = function_params->output_height_stride;
3084     const int bias_increment = function_params->bias_increment;
3085 
3086     const int32 output_activation_min =
3087         function_params->quantized_activation_min;
3088     const int32 output_activation_max =
3089         function_params->quantized_activation_max;
3090     const int32 output_multiplier = function_params->output_multiplier;
3091     const int32 output_shift = function_params->output_shift;
3092     const int32 output_offset = function_params->output_offset;
3093 
3094     TFLITE_DCHECK(depth_micro_repeats > 0);
3095 
3096     TFLITE_DCHECK_EQ(bias_increment, 4);
3097 
3098     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3099 
3100     // Simulate NEON-register transposition of subset of filter.
3101     int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
3102     int8 filter_bank_a_1[4][4];
3103     int8 filter_bank_a_2[4][4];
3104     int8 filter_bank_b_0[4][4];
3105     int8 filter_bank_b_1[4][4];
3106     int8 filter_bank_b_2[4][4];
3107     // Simulate NEON-register input data concatenation + sub-selection.
3108     // The three input banks below each hold 8 width positions for one
3109     // filter row (unit input depth).
3109 
3110     int8 input_bank_0[8];
3111     int8 input_bank_1[8];
3112     int8 input_bank_2[8];
3113 
3114     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
3115 
3116     uint8 output_values[2][4];  // Sub-block, depth 4.
3117 
3118     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3119       memcpy(filter_bank_a_0, filter_workspace, 16);
3120       memcpy(filter_bank_b_0, filter_workspace + 16, 16);
3121       memcpy(filter_bank_a_1, filter_workspace + 32, 16);
3122       memcpy(filter_bank_b_1, filter_workspace + 48, 16);
3123       memcpy(filter_bank_a_2, filter_workspace + 64, 16);
3124       memcpy(filter_bank_b_2, filter_workspace + 80, 16);
3125 
3126       // Work through one slice, by row, at a time.
3127       for (int k_height = 0; k_height < block_height; ++k_height) {
3128         const int8* scratch_data =
3129             scratch_block_data +
3130             workspace_height_stride * k_height * stride_val;
3131         uint8* output_data =
3132             output_block_data + output_height_stride * k_height + 8 * j_depth;
3133 
3134         memcpy(input_bank_0, scratch_data, 4);
3135         memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
3136         memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
3137 
3138         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3139              ++i_width) {
3140           const int output_width = i_width == output_width_micro_repeats
3141                                        ? residual_width
3142                                        : four_over_stride;
3143 
3144           TFLITE_DCHECK_LE(output_width * stride_val, 4);
3145           const int8* input_data = scratch_data + 4 * i_width;
3146 
3147           memcpy(input_bank_0 + 4, input_data + 4, 4);
3148           memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
3149           memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
3150                  4);
3151 
3152           // Iterate over input width shifts within 4x4 blocks.
3153           for (int w = 0; w < output_width; ++w) {
3154             constexpr int offset =
3155                 0;  // Shift input instead of offset in multiply-accumulate.
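            // The input banks are advanced by stride_val after each output
            // position (see the register-shift simulation below), so the
            // 4-wide window always starts at lane 0.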
3156 
3157             {
3158               const int s = 0;
3159               for (int d = 0; d < 4; ++d) {
3160                 int32 acc = bias_data[s * 4 + d];
3161                 for (int x = 0; x < 4; ++x) {
3162                   int32 input_val_0 = input_bank_0[offset + x];
3163                   int32 filter_val_0 = filter_bank_a_0[d][x];
3164                   acc += filter_val_0 * input_val_0;
3165                   int32 input_val_1 = input_bank_1[offset + x];
3166                   int32 filter_val_1 = filter_bank_a_1[d][x];
3167                   acc += filter_val_1 * input_val_1;
3168                   int32 input_val_2 = input_bank_2[offset + x];
3169                   int32 filter_val_2 = filter_bank_a_2[d][x];
3170                   acc += filter_val_2 * input_val_2;
3171                 }
3172                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3173                     DepthwiseConvOutputRounding::kUpward>(
3174                     acc, output_multiplier, output_shift);
3175                 acc += output_offset;
3176                 acc = std::max(acc, output_activation_min);
3177                 acc = std::min(acc, output_activation_max);
3178                 output_values[s][d] = static_cast<uint8>(acc);
3179 
3180                 output_data[s * 4 + d] = output_values[s][d];
3181               }
3182             }
3183             {
3184               const int s = 1;
3185               for (int d = 0; d < 4; ++d) {
3186                 int32 acc = bias_data[s * 4 + d];
3187                 for (int x = 0; x < 4; ++x) {
3188                   int32 input_val_0 = input_bank_0[offset + x];
3189                   int32 filter_val_0 = filter_bank_b_0[d][x];
3190                   acc += filter_val_0 * input_val_0;
3191                   int32 input_val_1 = input_bank_1[offset + x];
3192                   int32 filter_val_1 = filter_bank_b_1[d][x];
3193                   acc += filter_val_1 * input_val_1;
3194                   int32 input_val_2 = input_bank_2[offset + x];
3195                   int32 filter_val_2 = filter_bank_b_2[d][x];
3196                   acc += filter_val_2 * input_val_2;
3197                 }
3198                 acc = reference_ops::depthwise_conv::DepthwiseConvRound<
3199                     DepthwiseConvOutputRounding::kUpward>(
3200                     acc, output_multiplier, output_shift);
3201                 acc += output_offset;
3202                 acc = std::max(acc, output_activation_min);
3203                 acc = std::min(acc, output_activation_max);
3204                 output_values[s][d] = static_cast<uint8>(acc);
3205 
3206                 output_data[s * 4 + d] = output_values[s][d];
3207               }
3208             }
3209 
3210             // Simulate register shifts.
3211             for (int i = 0; i < (8 - stride_val); ++i) {
3212               input_bank_0[i] = input_bank_0[i + stride_val];
3213               input_bank_1[i] = input_bank_1[i + stride_val];
3214               input_bank_2[i] = input_bank_2[i + stride_val];
3215             }
3216 
3217             output_data += output_depth;
3218           }
3219         }
3220       }
3221       bias_data += 2 * bias_increment;
3222       filter_workspace += shuffled_filter_increment;
3223     }
3224   }
3225 };
3226 // The preceding section is only compiled when kUseUnwound3x3DotProduct versions
3227 // of templated functions are selected.
3228 //
3229 // End of code section containing intermediate code transformation.
3230 
3231 #ifdef USE_NEON
3232 template <>
3233 struct KernelMacroBlock<
3234     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3235     QuantizationType::kNonPerChannelUint8,
3236     DepthwiseConvDepthMultiplication::kNoMultiplication,
3237     /*stride=*/1> {
3238   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3239   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3240     return vmin_u8(a, b);
3241   }
3242   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3243     return vmax_u8(a, b);
3244   }
3245   static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
3246     return vminq_u8(a, b);
3247   }
3248   static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
3249     return vmaxq_u8(a, b);
3250   }
3251 
3252   static inline void KernelMacroBlockIntrinsics(
3253       const int8* scratch_block_data, const int8* filter_workspace,
3254       const int32* bias_data, uint8* output_block_data,
3255       const DepthwiseConvDotProdParams* function_params) {
3256     static constexpr QuantizationType quantization_type =
3257         QuantizationType::kNonPerChannelUint8;
3258 
3259     const int workspace_height_stride =
3260         function_params->workspace_height_stride;
3261     const int input_width_overall_micro_repeats =
3262         function_params->input_width_overall_micro_repeats;
3263     const int output_width_micro_repeats =
3264         function_params->output_width_micro_repeats;
3265     const int depth_micro_repeats = function_params->depth_micro_repeats;
3266     const int depth = function_params->input_depth;
3267 
3268     const int output_width_overall_micro_repeats =
3269         function_params->output_width_overall_micro_repeats;
3270     const int block_height = function_params->outbound_block_height;
3271     const int residual_width = function_params->output_residual_width;
3272     const int output_height_stride = function_params->output_height_stride;
3273     constexpr int kBiasIncrement = 4;
3274 
3275     TFLITE_DCHECK(depth_micro_repeats > 0);
3276     const int width_micro_stride = 4 * 8;
3277     const int depth_micro_stride =
3278         width_micro_stride * input_width_overall_micro_repeats;
3279 
3280     const int32 output_activation_min =
3281         function_params->quantized_activation_min;
3282     const int32 output_activation_max =
3283         function_params->quantized_activation_max;
3284     const int32 output_multiplier = function_params->output_multiplier;
3285     const int32 output_shift = function_params->output_shift;
3286     const int32 output_offset = function_params->output_offset;
3287     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3288       TFLITE_DCHECK_GE(output_activation_min, 0);
3289       TFLITE_DCHECK_LT(output_activation_min, 256);
3290       TFLITE_DCHECK_GE(output_activation_max, 0);
3291       TFLITE_DCHECK_LT(output_activation_max, 256);
3292     } else {
3293       TFLITE_DCHECK_GE(output_activation_min, -128);
3294       TFLITE_DCHECK_LT(output_activation_min, 128);
3295       TFLITE_DCHECK_GE(output_activation_max, -128);
3296       TFLITE_DCHECK_LT(output_activation_max, 128);
3297     }
3298     TFLITE_DCHECK_GE(output_offset, -32768);
3299     TFLITE_DCHECK_LT(output_offset, 32768);
3300 
3301     const int16x8_t output_offset_vec =
3302         vdupq_n_s16(static_cast<int16>(output_offset));
3303     const uint8x16_t output_activation_min_vec =
3304         vdupq_n_u8(static_cast<uint8>(output_activation_min));
3305     const uint8x16_t output_activation_max_vec =
3306         vdupq_n_u8(static_cast<uint8>(output_activation_max));
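
    // Requantization applied to each int32 accumulator below: a saturating
    // doubling high multiply by output_multiplier (vqrdmulhq_n_s32), an
    // upward-rounding divide by a power of two (DivideByPOT), a saturating
    // narrow to int16, a saturating add of output_offset, a saturating narrow
    // to uint8, and a final clamp to the activation range.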
3307 
3308     const int8* input_data_depthwise = scratch_block_data;
3309     typename QuantizationTypeImpl<quantization_type>::ExternalType*
3310         output_data_depthwise = output_block_data;
3311     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3312       // Simulate NEON-register transposition of subset of filter.
3313       int8x16_t filter_reg_0_a;
3314       int8x16_t filter_reg_0_b;
3315       int8x16_t filter_reg_1_a;
3316       int8x16_t filter_reg_1_b;
3317       int8x16_t filter_reg_2_a;
3318       int8x16_t filter_reg_2_b;
3319       int8x16_t filter_reg_0_a_shifted;
3320       int8x16_t filter_reg_1_a_shifted;
3321       int8x16_t filter_reg_2_a_shifted;
3322 
3323       filter_reg_0_a = vld1q_s8(filter_workspace);
3324       filter_workspace += 16;
3325       filter_reg_0_b = vld1q_s8(filter_workspace);
3326       filter_workspace += 16;
3327       filter_reg_1_a = vld1q_s8(filter_workspace);
3328       filter_workspace += 16;
3329       filter_reg_1_b = vld1q_s8(filter_workspace);
3330       filter_workspace += 16;
3331       filter_reg_2_a = vld1q_s8(filter_workspace);
3332       filter_workspace += 16;
3333       filter_reg_2_b = vld1q_s8(filter_workspace);
3334       filter_workspace += 16;
3335 
3336       filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
3337       filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
3338       filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
3339 
3340       if (block_height == 4) {
3341         for (int s = 0; s < 2; ++s) {
3342           // Work through one slice, by row, at a time.
3343           const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
3344           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3345               output_data_base = output_data_depthwise + 4 * s;
3346 
3347           const int8* next_input_data = input_data_base;
3348           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3349               output_data = output_data_base;
3350 
3351           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3352           bias_data += kBiasIncrement;
3353 
3354           // Load first sub-micro block of data into operational banks.
3355           int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
3356           int8x16_t left_bank_1_reg =
3357               vld1q_s8(next_input_data + workspace_height_stride);
3358           int8x16_t left_bank_2_reg =
3359               vld1q_s8(next_input_data + 2 * workspace_height_stride);
3360           int8x16_t left_bank_3_reg =
3361               vld1q_s8(next_input_data + 3 * workspace_height_stride);
3362           int8x16_t left_bank_4_reg =
3363               vld1q_s8(next_input_data + 4 * workspace_height_stride);
3364           int8x16_t left_bank_5_reg =
3365               vld1q_s8(next_input_data + 5 * workspace_height_stride);
3366 
3367           int32x4_t acc0;
3368           int32x4_t acc1;
3369           int32x4_t acc2;
3370           int32x4_t acc3;
3371 
3372           acc0 = adjusted_bias_data;
3373           acc1 = adjusted_bias_data;
3374           acc2 = adjusted_bias_data;
3375           acc3 = adjusted_bias_data;
3376 
3377           acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3378           acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3379           acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3380           acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3381 
3382           for (int i_width = 0; i_width < output_width_micro_repeats;
3383                ++i_width) {
3384             next_input_data += width_micro_stride;
3385 
3386             // Iterate over input width shifts within 4x4 blocks.
3387             {
3388               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3389               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3390               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3391               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3392               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3393               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3394               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3395               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3396 
3397               // Fixed-point multiplication.
3398               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3399               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3400                   acc0, -output_shift);
3401               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3402               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3403                   acc1, -output_shift);
3404               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3405               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3406                   acc2, -output_shift);
3407               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3408               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3409                   acc3, -output_shift);
3410               // Add the output offset.
3411               int16x8_t acc_s16_0_1 =
3412                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3413               int16x8_t acc_s16_2_3 =
3414                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3415               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3416               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3417               // Apply the activation function.
3418               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3419                                                   vqmovxn_s16(acc_s16_2_3));
3420               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3421               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3422 
3423               vst1q_lane_8x4(output_data, acc_u8_all, 0);
3424               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3425               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3426                              2);
3427               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3428                              3);
3429 
3430               output_data += depth;
3431             }
3432 
3433             // Load next sub-micro block of data.
3434             int8x16_t right_bank_0_reg;
3435             int8x16_t right_bank_1_reg;
3436             int8x16_t right_bank_2_reg;
3437             int8x16_t right_bank_3_reg;
3438             int8x16_t right_bank_4_reg;
3439             int8x16_t right_bank_5_reg;
3440 
3441             // Loading of next block always valid.
3442             right_bank_0_reg = vld1q_s8(next_input_data);
3443             right_bank_1_reg =
3444                 vld1q_s8(next_input_data + workspace_height_stride);
3445             right_bank_2_reg =
3446                 vld1q_s8(next_input_data + 2 * workspace_height_stride);
3447             right_bank_3_reg =
3448                 vld1q_s8(next_input_data + 3 * workspace_height_stride);
3449             right_bank_4_reg =
3450                 vld1q_s8(next_input_data + 4 * workspace_height_stride);
3451             right_bank_5_reg =
3452                 vld1q_s8(next_input_data + 5 * workspace_height_stride);
3453 
3454             {
3455               acc0 = adjusted_bias_data;
3456               acc1 = adjusted_bias_data;
3457               acc2 = adjusted_bias_data;
3458               acc3 = adjusted_bias_data;
3459 
3460               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3461               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3462               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3463               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3464               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3465               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3466               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3467               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3468               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3469               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3470               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3471               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3472 
3473               // Fixed-point multiplication.
3474               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3475               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3476                   acc0, -output_shift);
3477               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3478               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3479                   acc1, -output_shift);
3480               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3481               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3482                   acc2, -output_shift);
3483               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3484               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3485                   acc3, -output_shift);
3486               // Add the output offset.
3487               int16x8_t acc_s16_0_1 =
3488                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3489               int16x8_t acc_s16_2_3 =
3490                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3491               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3492               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3493               // Apply the activation function.
3494               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3495                                                   vqmovxn_s16(acc_s16_2_3));
3496               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3497               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3498 
3499               vst1q_lane_8x4(output_data, acc_u8_all, 0);
3500               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3501               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3502                              2);
3503               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3504                              3);
3505 
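              // Advance each lane's 4-byte input window by two positions:
              // vrev32q_u16 swaps the two 16-bit halves of every lane, and
              // vtrn1_s8x2_in_place then merges in the leading bytes of the
              // right-hand bank.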
3506               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
3507               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
3508               left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
3509               left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
3510               left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
3511               left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
3512               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
3513               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
3514               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
3515               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
3516               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
3517               vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
3518 
3519               output_data += depth;
3520             }
3521 
3522             {
3523               acc0 = adjusted_bias_data;
3524               acc1 = adjusted_bias_data;
3525               acc2 = adjusted_bias_data;
3526               acc3 = adjusted_bias_data;
3527 
3528               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3529               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3530               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3531               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3532               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3533               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3534               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3535               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3536               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3537               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3538               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3539               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3540 
3541               // Fixed-point multiplication.
3542               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3543               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3544                   acc0, -output_shift);
3545               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3546               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3547                   acc1, -output_shift);
3548               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3549               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3550                   acc2, -output_shift);
3551               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3552               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3553                   acc3, -output_shift);
3554               // Add the output offset.
3555               int16x8_t acc_s16_0_1 =
3556                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3557               int16x8_t acc_s16_2_3 =
3558                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3559               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3560               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3561               // Apply the activation function.
3562               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3563                                                   vqmovxn_s16(acc_s16_2_3));
3564               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3565               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3566 
3567               vst1q_lane_8x4(output_data, acc_u8_all, 0);
3568               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3569               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3570                              2);
3571               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3572                              3);
3573 
3574               output_data += depth;
3575             }
3576 
3577             {
3578               acc0 = adjusted_bias_data;
3579               acc1 = adjusted_bias_data;
3580               acc2 = adjusted_bias_data;
3581               acc3 = adjusted_bias_data;
3582 
3583               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
3584               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
3585               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
3586               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
3587               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
3588               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
3589               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
3590               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
3591               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
3592               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
3593               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
3594               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
3595 
3596               // Fixed-point multiplication.
3597               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3598               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3599                   acc0, -output_shift);
3600               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3601               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3602                   acc1, -output_shift);
3603               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3604               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3605                   acc2, -output_shift);
3606               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3607               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3608                   acc3, -output_shift);
3609               // Add the output offset.
3610               int16x8_t acc_s16_0_1 =
3611                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3612               int16x8_t acc_s16_2_3 =
3613                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3614               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3615               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3616               // Apply the activation function.
3617               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3618                                                   vqmovxn_s16(acc_s16_2_3));
3619               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3620               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3621 
3622               vst1q_lane_8x4(output_data, acc_u8_all, 0);
3623               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3624               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3625                              2);
3626               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3627                              3);
3628 
3629               left_bank_0_reg = right_bank_0_reg;
3630               left_bank_1_reg = right_bank_1_reg;
3631               left_bank_2_reg = right_bank_2_reg;
3632               left_bank_3_reg = right_bank_3_reg;
3633               left_bank_4_reg = right_bank_4_reg;
3634               left_bank_5_reg = right_bank_5_reg;
3635 
3636               output_data += depth;
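              // Software-pipeline the next column: reseed the accumulators
              // with the bias and issue the dot products that only need banks
              // already sitting in registers.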
3637               acc0 = adjusted_bias_data;
3638               acc1 = adjusted_bias_data;
3639               acc2 = adjusted_bias_data;
3640               acc3 = adjusted_bias_data;
3641 
3642               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3643               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3644               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3645               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3646             }
3647           }
3648 
3649           if (residual_width > 0) {
3650             next_input_data += width_micro_stride;
3651             const int output_width = residual_width;
3652 
3653             // Load next sub-micro block of data.
3654             int8x16_t right_bank_0_reg;
3655             int8x16_t right_bank_1_reg;
3656             int8x16_t right_bank_2_reg;
3657             int8x16_t right_bank_3_reg;
3658             int8x16_t right_bank_4_reg;
3659             int8x16_t right_bank_5_reg;
3660             // Logic: (output_width - 1) * stride_val < 2.
3661             const bool no_right_block = output_width < 3;
3662 
3663             if (no_right_block) {
3664               // Only needed for sanitizer checks.
3665               right_bank_0_reg = vdupq_n_s8(0);
3666               right_bank_1_reg = vdupq_n_s8(0);
3667               right_bank_2_reg = vdupq_n_s8(0);
3668               right_bank_3_reg = vdupq_n_s8(0);
3669               right_bank_4_reg = vdupq_n_s8(0);
3670               right_bank_5_reg = vdupq_n_s8(0);
3671             } else {
3672               right_bank_0_reg = vld1q_s8(next_input_data);
3673               right_bank_1_reg =
3674                   vld1q_s8(next_input_data + workspace_height_stride);
3675               right_bank_2_reg =
3676                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
3677               right_bank_3_reg =
3678                   vld1q_s8(next_input_data + 3 * workspace_height_stride);
3679               right_bank_4_reg =
3680                   vld1q_s8(next_input_data + 4 * workspace_height_stride);
3681               right_bank_5_reg =
3682                   vld1q_s8(next_input_data + 5 * workspace_height_stride);
3683             }
3684 
3685             // Iterate over input width shifts within 4x4 blocks.
3686             for (int x = 0; x < output_width; ++x) {
3687               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
3688               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
3689               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
3690               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
3691               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
3692               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
3693               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
3694               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
3695 
3696               // Fixed-point multiplication.
3697               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
3698               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3699                   acc0, -output_shift);
3700               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
3701               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3702                   acc1, -output_shift);
3703               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
3704               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3705                   acc2, -output_shift);
3706               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
3707               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3708                   acc3, -output_shift);
3709               // Add the output offset.
3710               int16x8_t acc_s16_0_1 =
3711                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
3712               int16x8_t acc_s16_2_3 =
3713                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
3714               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
3715               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
3716               // Apply the activation function.
3717               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
3718                                                   vqmovxn_s16(acc_s16_2_3));
3719               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
3720               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
3721 
3722               vst1q_lane_8x4(output_data, acc_u8_all, 0);
3723               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
3724               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
3725                              2);
3726               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3727                              3);
3728 
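              // Advance each lane's 4-byte input window by one position,
              // pulling the next byte in from the right-hand bank
              // (biregister_rotate_8 is defined earlier in this file).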
3729               biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
3730               biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
3731               biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
3732               biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
3733               biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
3734               biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
3735 
3736               output_data += depth;
3737 
3738               acc0 = adjusted_bias_data;
3739               acc1 = adjusted_bias_data;
3740               acc2 = adjusted_bias_data;
3741               acc3 = adjusted_bias_data;
3742 
3743               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
3744               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
3745               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
3746               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
3747             }
3748           }
3749           input_data_base += 4 * workspace_height_stride;
3750           output_data_base += 4 * output_height_stride;
3751 
3752           // Move to next sub-block: advance to second set of filters, to new
3753           // bias.
3754           filter_reg_0_a = filter_reg_0_b;
3755           filter_reg_1_a = filter_reg_1_b;
3756           filter_reg_2_a = filter_reg_2_b;
3757           filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
3758           filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
3759           filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
3760         }
3761       } else {
3762         const int8* input_data_base = input_data_depthwise;
3763         typename QuantizationTypeImpl<quantization_type>::ExternalType*
3764             output_data_base = output_data_depthwise;
3765 
3766         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
3767         bias_data += kBiasIncrement;
3768         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
3769         bias_data += kBiasIncrement;
3770 
3771         for (int k_height = 0; k_height < block_height; ++k_height) {
3772           const int8* next_input_data = input_data_base;
3773           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3774               output_data = output_data_base;
3775 
3776           // Load first sub-micro block of data into operational banks.
3777           int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
3778           int8x16_t left_bank_1_reg_a =
3779               vld1q_s8(next_input_data + workspace_height_stride);
3780           int8x16_t left_bank_2_reg_a =
3781               vld1q_s8(next_input_data + 2 * workspace_height_stride);
3782           int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3783           int8x16_t left_bank_1_reg_b =
3784               vld1q_s8(next_input_data + workspace_height_stride + 16);
3785           int8x16_t left_bank_2_reg_b =
3786               vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3787 
3788           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
3789                ++i_width) {
3790             next_input_data += width_micro_stride;
3791             const int output_width =
3792                 i_width == output_width_micro_repeats ? residual_width : 4;
3793 
3794             int8x16_t right_bank_0_reg_a;
3795             int8x16_t right_bank_1_reg_a;
3796             int8x16_t right_bank_2_reg_a;
3797             int8x16_t right_bank_0_reg_b;
3798             int8x16_t right_bank_1_reg_b;
3799             int8x16_t right_bank_2_reg_b;
3800             // Logic: (output_width - 1) * stride_val < 2.
3801             const bool no_right_block = output_width < 3;
3802 
3803             // Load next sub-micro block of data.
3804             if (no_right_block) {
3805               // Only needed for sanitizer checks.
3806               right_bank_0_reg_a = vdupq_n_s8(0);
3807               right_bank_1_reg_a = vdupq_n_s8(0);
3808               right_bank_2_reg_a = vdupq_n_s8(0);
3809               right_bank_0_reg_b = vdupq_n_s8(0);
3810               right_bank_1_reg_b = vdupq_n_s8(0);
3811               right_bank_2_reg_b = vdupq_n_s8(0);
3812             } else {
3813               right_bank_0_reg_a = vld1q_s8(next_input_data);
3814               right_bank_1_reg_a =
3815                   vld1q_s8(next_input_data + workspace_height_stride);
3816               right_bank_2_reg_a =
3817                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
3818               right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
3819               right_bank_1_reg_b =
3820                   vld1q_s8(next_input_data + workspace_height_stride + 16);
3821               right_bank_2_reg_b =
3822                   vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
3823             }
3824 
3825             // Iterate over input width shifts within 4x4 blocks.
3826             for (int x = 0; x < output_width; ++x) {
3827               int32x4_t acc_a = adjusted_bias_data_a;
3828               int32x4_t acc_b = adjusted_bias_data_b;
3829               acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
3830               acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
3831               acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
3832               acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
3833               acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
3834               acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
3835 
3836               // Fixed-point multiplication.
3837               acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
3838               acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
3839               acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3840                   acc_a, -output_shift);
3841               acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
3842                   acc_b, -output_shift);
3843               // Add the output offset.
3844               int16x8_t acc_s16_0_0 =
3845                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
3846               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
3847               // Apply the activation function.
3848               uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
3849               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
3850                                         vget_low_u8(output_activation_min_vec));
3851               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
3852                                         vget_low_u8(output_activation_max_vec));
3853 
3854               util_vst1_x8(output_data, acc_u8_0_0);
3855 
3856               biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
3857               biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
3858               biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
3859               biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
3860               biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
3861               biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
3862 
3863               output_data += depth;
3864             }
3865           }
3866           input_data_base += workspace_height_stride;
3867           output_data_base += output_height_stride;
3868         }
3869       }
3870       input_data_depthwise += depth_micro_stride;
3871       output_data_depthwise += 8;
3872     }
3873   }  // NOLINT(readability/fn_size) Manually unrolled.
3874 
3875   static inline void Run(const int8* scratch_block_data,
3876                          const int8* filter_workspace, const int32* bias_data,
3877                          uint8* output_block_data,
3878                          const DepthwiseConvDotProdParams* function_params) {
3879     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
3880                                output_block_data, function_params);
3881   }
3882 };
3883 
3884 template <>
3885 struct KernelMacroBlock<
3886     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
3887     QuantizationType::kNonPerChannelUint8,
3888     DepthwiseConvDepthMultiplication::kNoMultiplication,
3889     /*stride=*/2> {
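  // Stride-2, no depth multiplication: each input micro block contributes at
  // most two output columns per row (four_over_stride == 2).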
3890   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
3891   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
3892     return vmin_u8(a, b);
3893   }
3894   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
3895     return vmax_u8(a, b);
3896   }
3897 
3898   static inline void KernelMacroBlockIntrinsics(
3899       const int8* scratch_block_data, const int8* filter_workspace,
3900       const int32* bias_data, uint8* output_block_data,
3901       const DepthwiseConvDotProdParams* function_params) {
3902     static constexpr QuantizationType quantization_type =
3903         QuantizationType::kNonPerChannelUint8;
3904 
3905     const int workspace_height_stride =
3906         function_params->workspace_height_stride;
3907     const int input_width_overall_micro_repeats =
3908         function_params->input_width_overall_micro_repeats;
3909     const int output_width_micro_repeats =
3910         function_params->output_width_micro_repeats;
3911     const int depth_micro_repeats = function_params->depth_micro_repeats;
3912     const int depth = function_params->input_depth;
3913     constexpr int kStrideVal = 2;
3914     constexpr int kFourOverStride = 2;
3915     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
3916     TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
3917 
3918     const int workspace_width_micro_repeats =
3919         function_params->workspace_width_micro_repeats;
3920     const int output_width_overall_micro_repeats =
3921         function_params->output_width_overall_micro_repeats;
3922     const int block_height = function_params->outbound_block_height;
3923     const int residual_width = function_params->output_residual_width;
3924     const int output_height_stride = function_params->output_height_stride;
3925     constexpr int kBiasIncrement = 4;
3926 
3927     TFLITE_DCHECK(depth_micro_repeats > 0);
3928     const int width_micro_stride = 4 * 8;
3929     const int depth_micro_stride =
3930         width_micro_stride * input_width_overall_micro_repeats;
3931 
3932     const int32 output_activation_min =
3933         function_params->quantized_activation_min;
3934     const int32 output_activation_max =
3935         function_params->quantized_activation_max;
3936     const int32 output_multiplier = function_params->output_multiplier;
3937     const int32 output_shift = function_params->output_shift;
3938     const int32 output_offset = function_params->output_offset;
3939     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
3940       TFLITE_DCHECK_GE(output_activation_min, 0);
3941       TFLITE_DCHECK_LT(output_activation_min, 256);
3942       TFLITE_DCHECK_GE(output_activation_max, 0);
3943       TFLITE_DCHECK_LT(output_activation_max, 256);
3944     } else {
3945       TFLITE_DCHECK_GE(output_activation_min, -128);
3946       TFLITE_DCHECK_LT(output_activation_min, 128);
3947       TFLITE_DCHECK_GE(output_activation_max, -128);
3948       TFLITE_DCHECK_LT(output_activation_max, 128);
3949     }
3950     TFLITE_DCHECK_GE(output_offset, -32768);
3951     TFLITE_DCHECK_LT(output_offset, 32768);
3952 
3953     // This version only does min/max on 64 bits.
3954     const int16x8_t output_offset_vec =
3955         vdupq_n_s16(static_cast<int16>(output_offset));
3956     const uint8x8_t output_activation_min_vec =
3957         vdup_n_u8(static_cast<uint8>(output_activation_min));
3958     const uint8x8_t output_activation_max_vec =
3959         vdup_n_u8(static_cast<uint8>(output_activation_max));
3960 
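    // 96 bytes of shuffled filter data per depth micro block: 3 filter rows x
    // 2 register halves x 16 bytes.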
3961     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
3962 
3963     TFLITE_DCHECK_LE(block_height, 2);
3964 
3965     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
3966       const int8* filter_block =
3967           filter_workspace + shuffled_filter_increment * j_depth;
3968 
3969       if (block_height == 2) {
3970         for (int s = 0; s < 2; ++s) {
3971           // Simulate NEON-register transposition of subset of filter.
3972           int8x16_t filter_reg_0_a;
3973           int8x16_t filter_reg_1_a;
3974           int8x16_t filter_reg_2_a;
3975 
3976           filter_reg_0_a = vld1q_s8(filter_block + s * 16);
3977           filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
3978           filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
3979 
3980           const int8* scratch_data =
3981               scratch_block_data + depth_micro_stride * j_depth;
3982           typename QuantizationTypeImpl<quantization_type>::ExternalType*
3983               output_data = output_block_data + 8 * j_depth;
3984           const int8* input_data_0 = scratch_data + s * 2 * 8;
3985 
3986           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
3987 
3988           // Load first sub-micro block of data into operational banks.
3989           int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
3990           int8x16_t left_bank_1_reg =
3991               vld1q_s8(input_data_0 + workspace_height_stride);
3992           int8x16_t left_bank_2_reg =
3993               vld1q_s8(input_data_0 + 2 * workspace_height_stride);
3994           int8x16_t left_bank_3_reg =
3995               vld1q_s8(input_data_0 + 3 * workspace_height_stride);
3996           int8x16_t left_bank_4_reg =
3997               vld1q_s8(input_data_0 + 4 * workspace_height_stride);
3998 
3999           int8x16_t right_bank_0_reg;
4000           int8x16_t right_bank_1_reg;
4001           int8x16_t right_bank_2_reg;
4002           int8x16_t right_bank_3_reg;
4003           int8x16_t right_bank_4_reg;
4004 
4005           int32x4_t acc0;
4006           int32x4_t acc1;
4007           int16x8_t acc_s16_0_1;
4008           uint8x8_t acc_u8;
4009 
4010           int i_width = 0;
4011 
4012           // When output_width_micro_repeats <
4013           // output_width_overall_micro_repeats, 0 < residual_width <= 2, so
4014           // checking residual_width == 1 is equivalent to residual_width < 2.
4015           const int adjusted_width_micro_repeats =
4016               (output_width_micro_repeats <
4017                output_width_overall_micro_repeats) &&
4018                       (residual_width == 1)
4019                   ? output_width_micro_repeats
4020                   : output_width_overall_micro_repeats;
4021 
4022           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4023             const int output_width = kFourOverStride;
4024             TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4025             const int8* input_data =
4026                 input_data_0 + width_micro_stride * i_width;
4027             acc0 = adjusted_bias_data;
4028             acc1 = adjusted_bias_data;
4029             right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
4030             right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
4031                                         workspace_height_stride);
4032 
4033             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4034             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4035             typename QuantizationTypeImpl<quantization_type>::ExternalType*
4036                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4037 
4038             right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
4039                                         2 * workspace_height_stride);
4040             right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
4041                                         3 * workspace_height_stride);
4042             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4043             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4044             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4045             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4046             right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
4047                                         4 * workspace_height_stride);
4048 
4049             // Fixed-point multiplication.
4050             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4051             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4052                 acc0, -output_shift);
4053             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4054             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4055                 acc1, -output_shift);
4056             // Add the output offset.
4057             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4058             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4059             // Apply the activation function.
4060             acc_u8 = vqmovxn_s16(acc_s16_0_1);
4061             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4062             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4063 
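            // Shift each lane's input window ahead by two positions (one
            // stride-2 output step): swap the 16-bit halves of every lane and
            // merge in the leading bytes of the right-hand bank.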
4064             left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
4065             left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
4066             left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
4067             left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
4068             left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
4069             acc0 = adjusted_bias_data;
4070             acc1 = adjusted_bias_data;
4071             vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4072             vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4073             vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4074             vst1_lane_8x4(output_data_base, acc_u8, 0);
4075             vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
4076 
4077             vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4078             vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4079 
4080             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4081             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4082             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4083             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4084             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4085             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4086 
4087             // Fixed-point multiplication.
4088             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4089             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4090                 acc0, -output_shift);
4091             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4092             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4093                 acc1, -output_shift);
4094             // Add the output offset.
4095             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4096             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4097             // Apply the activation function.
4098             acc_u8 = vqmovxn_s16(acc_s16_0_1);
4099             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4100             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4101 
4102             vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
4103             vst1_lane_8x4(output_data_base + depth + output_height_stride,
4104                           acc_u8, 1);
4105 
4106             left_bank_0_reg = right_bank_0_reg;
4107             left_bank_1_reg = right_bank_1_reg;
4108             left_bank_2_reg = right_bank_2_reg;
4109             left_bank_3_reg = right_bank_3_reg;
4110             left_bank_4_reg = right_bank_4_reg;
4111           }
4112           for (; i_width < output_width_overall_micro_repeats; ++i_width) {
4113             TFLITE_DCHECK_NE(residual_width, kFourOverStride);
4114 
4115             // No need to load next ("right") block of data.
4116 
4117             typename QuantizationTypeImpl<quantization_type>::ExternalType*
4118                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
4119 
4120             // Iterate over input width shifts within 4x4 blocks.
4121             {
4122               acc0 = adjusted_bias_data;
4123               acc1 = adjusted_bias_data;
4124 
4125               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
4126               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
4127               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
4128               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
4129               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
4130               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
4131 
4132               // Fixed-point multiplication.
4133               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4134               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4135                   acc0, -output_shift);
4136               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4137               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4138                   acc1, -output_shift);
4139               // Add the output offset.
4140               int16x8_t acc_s16_0_1 =
4141                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4142               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4143               // Apply the activation function.
4144               uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4145               acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4146               acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4147 
4148               vst1_lane_8x4(output_data_base, acc_u8, 0);
4149               vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
4150 
4151               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
4152               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
4153               left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
4154               left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
4155               left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
4156               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
4157               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
4158               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
4159               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
4160               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
4161             }
4162           }
4163           bias_data += kBiasIncrement;
4164         }
4165       } else {
4166         // block_height == 1.
4167         int8x16_t filter_reg_0_a;
4168         int8x16_t filter_reg_1_a;
4169         int8x16_t filter_reg_2_a;
4170         int8x16_t filter_reg_0_b;
4171         int8x16_t filter_reg_1_b;
4172         int8x16_t filter_reg_2_b;
4173 
4174         filter_reg_0_a = vld1q_s8(filter_block);
4175         filter_reg_1_a = vld1q_s8(filter_block + 32);
4176         filter_reg_2_a = vld1q_s8(filter_block + 64);
4177         filter_reg_0_b = vld1q_s8(filter_block + 16);
4178         filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
4179         filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
4180 
4181         const int8* scratch_data =
4182             scratch_block_data + depth_micro_stride * j_depth;
4183         typename QuantizationTypeImpl<quantization_type>::ExternalType*
4184             output_data = output_block_data + 8 * j_depth;
4185         const int8* input_data_0 = scratch_data;
4186 
4187         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4188         bias_data += kBiasIncrement;
4189         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4190         bias_data += kBiasIncrement;
4191 
4192         // Load first sub-micro block of data into operational banks.
4193         int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
4194         int8x16_t left_bank_1_reg_a =
4195             vld1q_s8(input_data_0 + workspace_height_stride);
4196         int8x16_t left_bank_2_reg_a =
4197             vld1q_s8(input_data_0 + 2 * workspace_height_stride);
4198         int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
4199         int8x16_t left_bank_1_reg_b =
4200             vld1q_s8(input_data_0 + workspace_height_stride + 16);
4201         int8x16_t left_bank_2_reg_b =
4202             vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
4203 
4204         int8x16_t right_bank_0_reg_a;
4205         int8x16_t right_bank_1_reg_a;
4206         int8x16_t right_bank_2_reg_a;
4207         int8x16_t right_bank_0_reg_b;
4208         int8x16_t right_bank_1_reg_b;
4209         int8x16_t right_bank_2_reg_b;
4210 
4211         int32x4_t acc0_a;
4212         int32x4_t acc0_b;
4213 
4214         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4215              ++i_width) {
4216           const int output_width = i_width == output_width_micro_repeats
4217                                        ? residual_width
4218                                        : kFourOverStride;
4219           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
4220           const int8* input_data = input_data_0 + width_micro_stride * i_width;
4221           const bool no_right_block = i_width == output_width_micro_repeats &&
4222                                       output_width_overall_micro_repeats ==
4223                                           workspace_width_micro_repeats;
4224 
4225           if (!no_right_block) {
4226             // Load next sub-micro block of data.
4227             right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
4228             right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
4229                                           workspace_height_stride);
4230             right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
4231                                           2 * workspace_height_stride);
4232             right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
4233             right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
4234                                           workspace_height_stride + 16);
4235             right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
4236                                           2 * workspace_height_stride + 16);
4237           }
4238 
4239           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4240               output_data_base = output_data + depth * 2 * i_width;
4241 
4242           // Iterate over input width shifts within 4x4 blocks.
4243           {
4244             acc0_a = adjusted_bias_data_a;
4245             acc0_b = adjusted_bias_data_b;
4246 
4247             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4248             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4249             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4250             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4251             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4252             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4253 
4254             // Fixed-point multiplication.
4255             acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4256             acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4257             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4258                 acc0_a, -output_shift);
4259             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4260                 acc0_b, -output_shift);
4261             // Add the output offset.
4262             int16x8_t acc_s16_0_1 =
4263                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4264             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4265             // Apply the activation function.
4266             uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4267             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4268             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4269 
4270             util_vst1_x8(output_data_base, acc_u8);
4271 
4272             left_bank_0_reg_a = vrev32q_u16(left_bank_0_reg_a);
4273             left_bank_1_reg_a = vrev32q_u16(left_bank_1_reg_a);
4274             left_bank_2_reg_a = vrev32q_u16(left_bank_2_reg_a);
4275             left_bank_0_reg_b = vrev32q_u16(left_bank_0_reg_b);
4276             left_bank_1_reg_b = vrev32q_u16(left_bank_1_reg_b);
4277             left_bank_2_reg_b = vrev32q_u16(left_bank_2_reg_b);
4278             vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
4279             vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
4280             vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
4281             vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
4282             vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
4283             vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
4284           }
4285 
4286           if (output_width > 1) {
4287             acc0_a = adjusted_bias_data_a;
4288             acc0_b = adjusted_bias_data_b;
4289 
4290             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
4291             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
4292             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
4293             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
4294             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
4295             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
4296 
4297             // Fixed-point multiplication.
4298             acc0_a = vqrdmulhq_n_s32(acc0_a, output_multiplier);
4299             acc0_b = vqrdmulhq_n_s32(acc0_b, output_multiplier);
4300             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4301                 acc0_a, -output_shift);
4302             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4303                 acc0_b, -output_shift);
4304             // Add the output offset.
4305             int16x8_t acc_s16_0_1 =
4306                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
4307             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4308             // Apply the activation function.
4309             uint8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
4310             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
4311             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
4312 
4313             util_vst1_x8(output_data_base + depth, acc_u8);
4314 
4315             left_bank_0_reg_a = right_bank_0_reg_a;
4316             left_bank_1_reg_a = right_bank_1_reg_a;
4317             left_bank_2_reg_a = right_bank_2_reg_a;
4318             left_bank_0_reg_b = right_bank_0_reg_b;
4319             left_bank_1_reg_b = right_bank_1_reg_b;
4320             left_bank_2_reg_b = right_bank_2_reg_b;
4321           }
4322         }
4323       }
4324     }
4325   }  // NOLINT(readability/fn_size) Manually unrolled.
4326 
4327   static inline void Run(const int8* scratch_block_data,
4328                          const int8* filter_workspace, const int32* bias_data,
4329                          uint8* output_block_data,
4330                          const DepthwiseConvDotProdParams* function_params) {
4331     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
4332                                output_block_data, function_params);
4333   }
4334 };
4335 
4336 template <>
4337 struct KernelMacroBlock<
4338     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
4339     QuantizationType::kNonPerChannelUint8,
4340     DepthwiseConvDepthMultiplication::kUnitInputDepth,
4341     /*stride=*/1> {
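  // Unit input depth (depth multiplication): every output channel reads the
  // single input channel, so each input row needs only 4 bytes per micro
  // block and two rows fit in each 16-byte input bank register.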
4342   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
4343   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
4344     return vmin_u8(a, b);
4345   }
4346   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
4347     return vmax_u8(a, b);
4348   }
4349   static inline uint8x16_t util_vminq_x8(uint8x16_t a, uint8x16_t b) {
4350     return vminq_u8(a, b);
4351   }
4352   static inline uint8x16_t util_vmaxq_x8(uint8x16_t a, uint8x16_t b) {
4353     return vmaxq_u8(a, b);
4354   }
4355 
4356   static inline void KernelMacroBlockIntrinsics(
4357       const int8* scratch_block_data, const int8* filter_workspace,
4358       const int32* bias_data, uint8* output_block_data,
4359       const DepthwiseConvDotProdParams* function_params) {
4360     static constexpr QuantizationType quantization_type =
4361         QuantizationType::kNonPerChannelUint8;
4362 
4363     TFLITE_DCHECK_EQ(function_params->stride, 1);
4364     const int workspace_height_stride =
4365         function_params->workspace_height_stride;
4366     const int output_width_micro_repeats =
4367         function_params->output_width_micro_repeats;
4368     const int depth_micro_repeats = function_params->depth_micro_repeats;
4369     const int output_depth = function_params->output_depth;
4370 
4371     const int output_width_overall_micro_repeats =
4372         function_params->output_width_overall_micro_repeats;
4373     const int block_height = function_params->outbound_block_height;
4374     const int residual_width = function_params->output_residual_width;
4375     const int output_height_stride = function_params->output_height_stride;
4376     constexpr int kBiasIncrement = 4;
4377 
4378     TFLITE_DCHECK(depth_micro_repeats > 0);
4379 
4380     const int32 output_activation_min =
4381         function_params->quantized_activation_min;
4382     const int32 output_activation_max =
4383         function_params->quantized_activation_max;
4384     const int32 output_multiplier = function_params->output_multiplier;
4385     const int32 output_shift = function_params->output_shift;
4386     const int32 output_offset = function_params->output_offset;
4387     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
4388       TFLITE_DCHECK_GE(output_activation_min, 0);
4389       TFLITE_DCHECK_LT(output_activation_min, 256);
4390       TFLITE_DCHECK_GE(output_activation_max, 0);
4391       TFLITE_DCHECK_LT(output_activation_max, 256);
4392     } else {
4393       TFLITE_DCHECK_GE(output_activation_min, -128);
4394       TFLITE_DCHECK_LT(output_activation_min, 128);
4395       TFLITE_DCHECK_GE(output_activation_max, -128);
4396       TFLITE_DCHECK_LT(output_activation_max, 128);
4397     }
4398     TFLITE_DCHECK_GE(output_offset, -32768);
4399     TFLITE_DCHECK_LT(output_offset, 32768);
4400 
4401     const int16x8_t output_offset_vec =
4402         vdupq_n_s16(static_cast<int16>(output_offset));
4403     const uint8x16_t output_activation_min_vec =
4404         vdupq_n_u8(static_cast<uint8>(output_activation_min));
4405     const uint8x16_t output_activation_max_vec =
4406         vdupq_n_u8(static_cast<uint8>(output_activation_max));
4407 
4408     typename QuantizationTypeImpl<quantization_type>::ExternalType*
4409         output_data_depthwise = output_block_data;
4410     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
4411       // Simulate NEON-register transposition of subset of filter.
4412       int8x16_t filter_reg_0_a;
4413       int8x16_t filter_reg_0_b;
4414       int8x16_t filter_reg_1_a;
4415       int8x16_t filter_reg_1_b;
4416       int8x16_t filter_reg_2_a;
4417       int8x16_t filter_reg_2_b;
4418       int8x16_t filter_reg_0_a_shifted;
4419       int8x16_t filter_reg_1_a_shifted;
4420       int8x16_t filter_reg_2_a_shifted;
4421 
4422       filter_reg_0_a = vld1q_s8(filter_workspace);
4423       filter_workspace += 16;
4424       filter_reg_0_b = vld1q_s8(filter_workspace);
4425       filter_workspace += 16;
4426       filter_reg_1_a = vld1q_s8(filter_workspace);
4427       filter_workspace += 16;
4428       filter_reg_1_b = vld1q_s8(filter_workspace);
4429       filter_workspace += 16;
4430       filter_reg_2_a = vld1q_s8(filter_workspace);
4431       filter_workspace += 16;
4432       filter_reg_2_b = vld1q_s8(filter_workspace);
4433       filter_workspace += 16;
4434 
4435       filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
4436       filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
4437       filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
4438 
4439       // When output_width_micro_repeats < output_width_overall_micro_repeats,
4440       // the trailing micro block is only partially filled (residual_width < 4),
4441       // so it is excluded from the full-width loop below.
4442       const int adjusted_width_micro_repeats =
4443           (output_width_micro_repeats < output_width_overall_micro_repeats) &&
4444                   (residual_width < 4)
4445               ? output_width_micro_repeats
4446               : output_width_overall_micro_repeats;
4447 
4448       if (block_height == 4) {
4449         for (int s = 0; s < 2; ++s) {
4450           // Work through one slice, by row, at a time.
4451           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4452               output_data_base = output_data_depthwise + 4 * s;
4453 
4454           const int8* next_input_data = scratch_block_data;
4455           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4456               output_data = output_data_base;
4457 
4458           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
4459           bias_data += kBiasIncrement;
4460 
4461           int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
4462           int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
4463           int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
4464 
4465           // Load first sub-micro block of data into operational banks.
4466           input_bank_a_reg =
4467               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
4468                                                 // uninitialized variable.
4469           input_bank_a_reg = vld1q_lane_8x4(
4470               next_input_data + workspace_height_stride, input_bank_a_reg, 2);
4471           input_bank_b_reg = vld1q_dup_s8x4(
4472               next_input_data +
4473               2 * workspace_height_stride);  // Load lane 0, avoiding
4474                                              // uninitialized variable.
4475           input_bank_b_reg =
4476               vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4477                              input_bank_b_reg, 2);
4478           input_bank_c_reg = vld1q_dup_s8x4(
4479               next_input_data +
4480               4 * workspace_height_stride);  // Load lane 0, avoiding
4481                                              // uninitialized variable.
4482           input_bank_c_reg =
4483               vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4484                              input_bank_c_reg, 2);
4485 
4486           int32x4_t acc0;
4487           int32x4_t acc1;
4488           int32x4_t acc2;
4489           int32x4_t acc3;
4490 
4491           acc0 = adjusted_bias_data;
4492           acc1 = adjusted_bias_data;
4493           acc2 = adjusted_bias_data;
4494           acc3 = adjusted_bias_data;
4495 
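          // vdotq_four_lane_s32 (defined earlier in this file) dots the filter
          // bytes against the four input bytes broadcast from the selected
          // 32-bit lane of the input bank; with unit input depth all four
          // output channels in the accumulator share those same input bytes.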
4496           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
4497           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
4498           acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
4499           acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
4500 
4501           int i_width = 0;
4502           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
4503             next_input_data += 4;
4504 
4505             // Iterate over input width shifts within 4x4 blocks.
4506             {
4507               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4508                                          0);
4509               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4510                                          2);
4511               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4512                                          2);
4513               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4514                                          2);
4515               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4516                                          2);
4517               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4518                                          0);
4519               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4520                                          0);
4521               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4522                                          2);
4523 
4524               // Fixed-point multiplication.
4525               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4526               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4527                   acc0, -output_shift);
4528               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4529               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4530                   acc1, -output_shift);
4531               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4532               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4533                   acc2, -output_shift);
4534               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4535               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4536                   acc3, -output_shift);
4537               // Add the output offset.
4538               int16x8_t acc_s16_0_1 =
4539                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4540               int16x8_t acc_s16_2_3 =
4541                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4542               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4543               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4544               // Apply the activation function.
4545               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4546                                                   vqmovxn_s16(acc_s16_2_3));
4547               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4548               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4549 
4550               vst1q_lane_8x4(output_data, acc_u8_all, 0);
4551               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4552               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4553                              2);
4554               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4555                              3);
4556 
4557               output_data += output_depth;
4558             }
4559             // Load next sub-micro block of data.
4560             input_bank_a_reg =
4561                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4562             input_bank_a_reg = vld1q_lane_8x4(
4563                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4564             input_bank_b_reg =
4565                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4566                                input_bank_b_reg, 1);
4567             input_bank_b_reg =
4568                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4569                                input_bank_b_reg, 3);
4570             input_bank_c_reg =
4571                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4572                                input_bank_c_reg, 1);
4573             input_bank_c_reg =
4574                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4575                                input_bank_c_reg, 3);
4576 
4577             {
4578               acc0 = adjusted_bias_data;
4579               acc1 = adjusted_bias_data;
4580               acc2 = adjusted_bias_data;
4581               acc3 = adjusted_bias_data;
4582 
4583               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4584                                          input_bank_a_reg, 0);
4585               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4586                                          input_bank_a_reg, 2);
4587               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4588                                          input_bank_b_reg, 0);
4589               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4590                                          input_bank_a_reg, 2);
4591               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4592                                          input_bank_b_reg, 0);
4593               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4594                                          input_bank_b_reg, 2);
4595               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4596                                          input_bank_b_reg, 0);
4597               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4598                                          input_bank_b_reg, 2);
4599               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4600                                          input_bank_c_reg, 0);
4601               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4602                                          input_bank_b_reg, 2);
4603               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4604                                          input_bank_c_reg, 0);
4605               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4606                                          input_bank_c_reg, 2);
4607 
4608               // Fixed-point multiplication.
4609               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4610               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4611                   acc0, -output_shift);
4612               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4613               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4614                   acc1, -output_shift);
4615               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4616               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4617                   acc2, -output_shift);
4618               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4619               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4620                   acc3, -output_shift);
4621               // Add the output offset.
4622               int16x8_t acc_s16_0_1 =
4623                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4624               int16x8_t acc_s16_2_3 =
4625                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4626               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4627               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4628               // Apply the activation function.
4629               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4630                                                   vqmovxn_s16(acc_s16_2_3));
4631               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4632               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4633 
4634               vst1q_lane_8x4(output_data, acc_u8_all, 0);
4635               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4636               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4637                              2);
4638               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4639                              3);
4640 
4641               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
4642               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
4643               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
4644 
4645               output_data += output_depth;
4646             }
4647 
4648             {
4649               acc0 = adjusted_bias_data;
4650               acc1 = adjusted_bias_data;
4651               acc2 = adjusted_bias_data;
4652               acc3 = adjusted_bias_data;
4653 
4654               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4655                                          0);
4656               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4657                                          2);
4658               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4659                                          0);
4660               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4661                                          2);
4662               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4663                                          0);
4664               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4665                                          2);
4666               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4667                                          0);
4668               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4669                                          2);
4670               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4671                                          0);
4672               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4673                                          2);
4674               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4675                                          0);
4676               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4677                                          2);
4678 
4679               // Fixed-point multiplication.
4680               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4681               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4682                   acc0, -output_shift);
4683               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4684               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4685                   acc1, -output_shift);
4686               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4687               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4688                   acc2, -output_shift);
4689               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4690               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4691                   acc3, -output_shift);
4692               // Add the output offset.
4693               int16x8_t acc_s16_0_1 =
4694                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4695               int16x8_t acc_s16_2_3 =
4696                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4697               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4698               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4699               // Apply the activation function.
4700               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4701                                                   vqmovxn_s16(acc_s16_2_3));
4702               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4703               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4704 
4705               vst1q_lane_8x4(output_data, acc_u8_all, 0);
4706               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4707               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4708                              2);
4709               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4710                              3);
4711 
4712               output_data += output_depth;
4713             }
4714 
4715             {
4716               acc0 = adjusted_bias_data;
4717               acc1 = adjusted_bias_data;
4718               acc2 = adjusted_bias_data;
4719               acc3 = adjusted_bias_data;
4720 
4721               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
4722                                          input_bank_a_reg, 0);
4723               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
4724                                          input_bank_a_reg, 2);
4725               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
4726                                          input_bank_b_reg, 0);
4727               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
4728                                          input_bank_a_reg, 2);
4729               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
4730                                          input_bank_b_reg, 0);
4731               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
4732                                          input_bank_b_reg, 2);
4733               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
4734                                          input_bank_b_reg, 0);
4735               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
4736                                          input_bank_b_reg, 2);
4737               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
4738                                          input_bank_c_reg, 0);
4739               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
4740                                          input_bank_b_reg, 2);
4741               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
4742                                          input_bank_c_reg, 0);
4743               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
4744                                          input_bank_c_reg, 2);
4745 
4746               // Fixed-point multiplication.
4747               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4748               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4749                   acc0, -output_shift);
4750               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4751               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4752                   acc1, -output_shift);
4753               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4754               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4755                   acc2, -output_shift);
4756               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4757               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4758                   acc3, -output_shift);
4759               // Add the output offset.
4760               int16x8_t acc_s16_0_1 =
4761                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4762               int16x8_t acc_s16_2_3 =
4763                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4764               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4765               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4766               // Apply the activation function.
4767               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4768                                                   vqmovxn_s16(acc_s16_2_3));
4769               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4770               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4771 
4772               vst1q_lane_8x4(output_data, acc_u8_all, 0);
4773               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4774               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4775                              2);
4776               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4777                              3);
4778 
4779               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
4780               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
4781               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
4782 
4783               output_data += output_depth;
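              // Pre-seed the accumulators for the next loop iteration: reload
              // the bias and take partial dot products against the lanes that
              // remain valid after the 16-bit bank shift above.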
4784               acc0 = adjusted_bias_data;
4785               acc1 = adjusted_bias_data;
4786               acc2 = adjusted_bias_data;
4787               acc3 = adjusted_bias_data;
4788 
4789               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4790                                          0);
4791               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4792                                          0);
4793               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4794                                          0);
4795               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4796                                          2);
4797             }
4798           }
4799 
4800           if (i_width < output_width_overall_micro_repeats) {
4801             next_input_data += 4;
4802             const int output_width = residual_width;
4803 
4804             // Load next sub-micro block of data.
4805             input_bank_a_reg =
4806                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
4807             input_bank_a_reg = vld1q_lane_8x4(
4808                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
4809             input_bank_b_reg =
4810                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4811                                input_bank_b_reg, 1);
4812             input_bank_b_reg =
4813                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
4814                                input_bank_b_reg, 3);
4815             input_bank_c_reg =
4816                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
4817                                input_bank_c_reg, 1);
4818             input_bank_c_reg =
4819                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
4820                                input_bank_c_reg, 3);
4821 
4822             // Iterate over input width shifts within 4x4 blocks.
4823             for (int x = 0; x < output_width; ++x) {
4824               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
4825                                          0);
4826               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
4827                                          2);
4828               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
4829                                          2);
4830               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
4831                                          2);
4832               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
4833                                          2);
4834               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
4835                                          0);
4836               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
4837                                          0);
4838               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
4839                                          2);
4840 
4841               // Fixed-point multiplication.
4842               acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
4843               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4844                   acc0, -output_shift);
4845               acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
4846               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4847                   acc1, -output_shift);
4848               acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
4849               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4850                   acc2, -output_shift);
4851               acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
4852               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4853                   acc3, -output_shift);
4854               // Add the output offset.
4855               int16x8_t acc_s16_0_1 =
4856                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
4857               int16x8_t acc_s16_2_3 =
4858                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
4859               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
4860               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
4861               // Apply the activation function.
4862               uint8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
4863                                                   vqmovxn_s16(acc_s16_2_3));
4864               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
4865               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
4866 
4867               vst1q_lane_8x4(output_data, acc_u8_all, 0);
4868               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
4869               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
4870                              2);
4871               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
4872                              3);
4873 
4874               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
4875               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
4876               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
4877 
4878               output_data += output_depth;
4879 
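              // Begin the next column's accumulation using the data already
              // resident in the input banks (shifted right by 8 bits above).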
4880               acc0 = adjusted_bias_data;
4881               acc1 = adjusted_bias_data;
4882               acc2 = adjusted_bias_data;
4883               acc3 = adjusted_bias_data;
4884 
4885               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
4886                                          0);
4887               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
4888                                          0);
4889               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
4890                                          0);
4891               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
4892                                          2);
4893             }
4894           }
4895           // scratch_block_data += 4 * workspace_height_stride;
4896           output_data_base += 4 * output_height_stride;
4897 
4898           // Move to next sub-block: advance to second set of filters, to new
4899           // bias.
4900           filter_reg_0_a = filter_reg_0_b;
4901           filter_reg_1_a = filter_reg_1_b;
4902           filter_reg_2_a = filter_reg_2_b;
4903           filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
4904           filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
4905           filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
4906         }
4907       } else {
4908         // Block height < 4.
4909         typename QuantizationTypeImpl<quantization_type>::ExternalType*
4910             output_data_base = output_data_depthwise;
4911 
4912         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
4913         bias_data += kBiasIncrement;
4914         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
4915         bias_data += kBiasIncrement;
4916 
4917         for (int k_height = 0; k_height < block_height; ++k_height) {
4918           const int8* next_input_data =
4919               scratch_block_data + k_height * workspace_height_stride;
4920           typename QuantizationTypeImpl<quantization_type>::ExternalType*
4921               output_data = output_data_base;
4922 
4923           int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
4924           int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.
4925 
4926           // Load first sub-micro block of data into operational banks.
4927           input_bank_p_reg =
4928               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
4929                                                 // uninitialized variable.
4930           input_bank_p_reg = vld1q_lane_8x4(
4931               next_input_data + workspace_height_stride, input_bank_p_reg, 2);
4932           input_bank_q_reg = vld1q_dup_s8x4(
4933               next_input_data +
4934               2 * workspace_height_stride);  // Load lane 0, avoiding
4935                                              // uninitialized variable.
4936 
4937           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
4938                ++i_width) {
4939             next_input_data += 4;
4940             const int output_width =
4941                 i_width == output_width_micro_repeats ? residual_width : 4;
4942 
4943             // Load next sub-micro block of data.
4944             input_bank_p_reg =
4945                 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
4946             input_bank_p_reg = vld1q_lane_8x4(
4947                 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
4948             input_bank_q_reg =
4949                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
4950                                input_bank_q_reg, 1);
4951             // Iterate over input width shifts within 4x4 blocks.
4952             for (int x = 0; x < output_width; ++x) {
4953               int32x4_t acc_a = adjusted_bias_data_a;
4954               int32x4_t acc_b = adjusted_bias_data_b;
4955               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
4956                                           input_bank_p_reg, 0);
4957               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
4958                                           input_bank_p_reg, 2);
4959               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
4960                                           input_bank_q_reg, 0);
4961               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
4962                                           input_bank_p_reg, 0);
4963               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
4964                                           input_bank_p_reg, 2);
4965               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
4966                                           input_bank_q_reg, 0);
4967 
4968               // Fixed-point multiplication.
4969               acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
4970               acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
4971               acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4972                   acc_a, -output_shift);
4973               acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
4974                   acc_b, -output_shift);
4975               // Add the output offset.
4976               int16x8_t acc_s16_0_0 =
4977                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
4978               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
4979               // Apply the activation function.
4980               uint8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
4981               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
4982                                         vget_low_u8(output_activation_min_vec));
4983               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
4984                                         vget_low_u8(output_activation_max_vec));
4985 
4986               util_vst1_x8(output_data, acc_u8_0_0);
4987 
4988               input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
4989               input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);
4990 
4991               output_data += output_depth;
4992             }
4993           }
4994           output_data_base += output_height_stride;
4995         }
4996       }
4997       output_data_depthwise += 8;
4998     }
4999   }  // NOLINT(readability/fn_size) Manually unrolled.
5000 
5001   static inline void Run(const int8* scratch_block_data,
5002                          const int8* filter_workspace, const int32* bias_data,
5003                          uint8* output_block_data,
5004                          const DepthwiseConvDotProdParams* function_params) {
5005     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5006                                output_block_data, function_params);
5007   }
5008 };
5009 
5010 template <>
5011 struct KernelMacroBlock<
5012     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5013     QuantizationType::kNonPerChannelUint8,
5014     DepthwiseConvDepthMultiplication::kUnitInputDepth,
5015     /*stride=*/2> {
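  // Stride-2, unit-input-depth variant with non-per-channel uint8
  // quantization. The helpers below select the unsigned saturating
  // narrow/min/max forms used throughout this specialization.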
5016   static inline uint8x8_t vqmovxn_s16(int16x8_t x) { return vqmovun_s16(x); }
5017   static inline uint8x8_t util_vmin_x8(uint8x8_t a, uint8x8_t b) {
5018     return vmin_u8(a, b);
5019   }
5020   static inline uint8x8_t util_vmax_x8(uint8x8_t a, uint8x8_t b) {
5021     return vmax_u8(a, b);
5022   }
5023 
5024   static inline void KernelMacroBlockIntrinsics(
5025       const int8* scratch_block_data, const int8* filter_workspace,
5026       const int32* bias_data, uint8* output_block_data,
5027       const DepthwiseConvDotProdParams* function_params) {
5028     static constexpr QuantizationType quantization_type =
5029         QuantizationType::kNonPerChannelUint8;
5030 
5031     const int workspace_height_stride =
5032         function_params->workspace_height_stride;
5033     const int output_width_micro_repeats =
5034         function_params->output_width_micro_repeats;
5035     const int depth_micro_repeats = function_params->depth_micro_repeats;
5036     const int output_depth = function_params->output_depth;
5037     constexpr int kStrideVal = 2;
5038     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
5039 
5040     const int output_width_overall_micro_repeats =
5041         function_params->output_width_overall_micro_repeats;
5042     const int block_height = function_params->outbound_block_height;
5043     const int residual_width = function_params->output_residual_width;
5044     const int output_height_stride = function_params->output_height_stride;
5045     constexpr int kBiasIncrement = 4;
5046 
5047     const int32 output_activation_min =
5048         function_params->quantized_activation_min;
5049     const int32 output_activation_max =
5050         function_params->quantized_activation_max;
5051     const int32 output_multiplier = function_params->output_multiplier;
5052     const int32 output_shift = function_params->output_shift;
5053     const int32 output_offset = function_params->output_offset;
5054     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5055       TFLITE_DCHECK_GE(output_activation_min, 0);
5056       TFLITE_DCHECK_LT(output_activation_min, 256);
5057       TFLITE_DCHECK_GE(output_activation_max, 0);
5058       TFLITE_DCHECK_LT(output_activation_max, 256);
5059     } else {
5060       TFLITE_DCHECK_GE(output_activation_min, -128);
5061       TFLITE_DCHECK_LT(output_activation_min, 128);
5062       TFLITE_DCHECK_GE(output_activation_max, -128);
5063       TFLITE_DCHECK_LT(output_activation_max, 128);
5064     }
5065     TFLITE_DCHECK_GE(output_offset, -32768);
5066     TFLITE_DCHECK_LT(output_offset, 32768);
5067 
5068     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
5069 
5070     const int16x8_t output_offset_vec =
5071         vdupq_n_s16(static_cast<int16>(output_offset));
5072     const uint8x16_t output_activation_min_vec =
5073         vdupq_n_u8(static_cast<uint8>(output_activation_min));
5074     const uint8x16_t output_activation_max_vec =
5075         vdupq_n_u8(static_cast<uint8>(output_activation_max));
5076 
5077     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5078       int8x16_t filter_reg_0_a;
5079       int8x16_t filter_reg_0_b;
5080       int8x16_t filter_reg_1_a;
5081       int8x16_t filter_reg_1_b;
5082       int8x16_t filter_reg_2_a;
5083       int8x16_t filter_reg_2_b;
5084 
5085       filter_reg_0_a = vld1q_s8(filter_workspace);
5086       filter_workspace += 16;
5087       filter_reg_0_b = vld1q_s8(filter_workspace);
5088       filter_workspace += 16;
5089       filter_reg_1_a = vld1q_s8(filter_workspace);
5090       filter_workspace += 16;
5091       filter_reg_1_b = vld1q_s8(filter_workspace);
5092       filter_workspace += 16;
5093       filter_reg_2_a = vld1q_s8(filter_workspace);
5094       filter_workspace += 16;
5095       filter_reg_2_b = vld1q_s8(filter_workspace);
5096       filter_workspace += 16;
5097 
5098       const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
5099       bias_data += kBiasIncrement;
5100       const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
5101       bias_data += kBiasIncrement;
5102 
5103       if (block_height == 2) {
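        // Two output rows are produced per pass. The "a" filter registers and
        // first bias vector cover output channels 0-3 of this 8-deep micro
        // block; the "b" registers and second bias vector cover channels 4-7,
        // written at output_data + 4.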
5104         const int8* scratch_data = scratch_block_data;
5105         typename QuantizationTypeImpl<quantization_type>::ExternalType*
5106             output_data = output_block_data + 8 * j_depth;
5107 
5108         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
5109         int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
5110         int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
5111 
5112         // Load first sub-micro block of data into operational banks.
5113         input_bank_a_reg =
5114             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
5115                                            // uninitialized variable.
5116         input_bank_a_reg = vld1q_lane_8x4(
5117             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5118         input_bank_b_reg = vld1q_dup_s8x4(
5119             scratch_data +
5120             2 * workspace_height_stride);  // Load lane 0, avoiding
5121                                            // uninitialized variable.
5122         input_bank_b_reg = vld1q_lane_8x4(
5123             scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
5124         input_bank_c_reg = vld1q_dup_s8x4(
5125             scratch_data +
5126             4 * workspace_height_stride);  // Load lane 0, avoiding
5127                                            // uninitialized variable.
5128 
5129         int32x4_t acc0;
5130         int32x4_t acc1;
5131 
5132         // When output_width_micro_repeats < output_width_overall_micro_repeats,
5133         // the residual width satisfies 0 < residual_width <= 2, so testing
5134         // residual_width < 2 is equivalent to testing residual_width == 1.
5135         const int adjusted_width_micro_repeats =
5136             (output_width_micro_repeats < output_width_overall_micro_repeats) &&
5137                     (residual_width < 2)
5138                 ? output_width_micro_repeats
5139                 : output_width_overall_micro_repeats;
5140 
5141         int i_width = 0;
5142         for (; i_width < adjusted_width_micro_repeats; ++i_width) {
5143           const int8* input_data = scratch_data + 4 + 4 * i_width;
5144 
5145           // Load next sub-micro block of data.
5146           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5147           input_bank_a_reg = vld1q_lane_8x4(
5148               input_data + workspace_height_stride, input_bank_a_reg, 3);
5149           input_bank_b_reg = vld1q_lane_8x4(
5150               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5151           input_bank_b_reg = vld1q_lane_8x4(
5152               input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5153           input_bank_c_reg = vld1q_lane_8x4(
5154               input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5155 
5156           int16x8_t acc_s16_0_1;
5157           uint8x8_t acc_u8_0_1;
5158           // Iterate over input width shifts within 4x4 blocks.
5159           {
5160             acc0 = adjusted_bias_data_s_0;
5161             acc1 = adjusted_bias_data_s_0;
5162 
5163             acc0 =
5164                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5165             acc0 =
5166                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5167             acc0 =
5168                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5169             acc1 =
5170                 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5171             acc1 =
5172                 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5173             acc1 =
5174                 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5175 
5176             // Fixed-point multiplication.
5177             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5178             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5179                 acc0, -output_shift);
5180             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5181             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5182                 acc1, -output_shift);
5183             // Add the output offset.
5184             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5185             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5186             // Apply the activation function.
5187             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5188             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5189                                       vget_low_u8(output_activation_min_vec));
5190             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5191                                       vget_low_u8(output_activation_max_vec));
5192 
5193             vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5194             vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5195 
5196             acc0 = adjusted_bias_data_s_1;
5197             acc1 = adjusted_bias_data_s_1;
5198 
5199             acc0 =
5200                 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5201             acc0 =
5202                 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5203             acc0 =
5204                 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5205             acc1 =
5206                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5207             acc1 =
5208                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5209             acc1 =
5210                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5211 
5212             // Fixed-point multiplication.
5213             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5214             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5215                 acc0, -output_shift);
5216             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5217             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5218                 acc1, -output_shift);
5219             // Add the output offset.
5220             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5221             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5222             // Apply the activation function.
5223             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5224             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5225                                       vget_low_u8(output_activation_min_vec));
5226             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5227                                       vget_low_u8(output_activation_max_vec));
5228 
5229             vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5230             vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5231                           1);
5232 
5233             input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5234             input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5235             input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5236 
5237             output_data += output_depth;
5238           }
5239 
5240           // Second output column of this micro block: output_width == four_over_stride.
5241           acc0 = adjusted_bias_data_s_0;
5242           acc1 = adjusted_bias_data_s_0;
5243 
5244           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5245           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5246           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5247           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5248           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5249           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5250 
5251           // Fixed-point multiplication.
5252           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5253           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5254               acc0, -output_shift);
5255           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5256           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5257               acc1, -output_shift);
5258           // Add the output offset.
5259           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5260           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5261           // Apply the activation function.
5262           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5263           acc_u8_0_1 =
5264               util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5265           acc_u8_0_1 =
5266               util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5267 
5268           vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5269           vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5270 
5271           acc0 = adjusted_bias_data_s_1;
5272           acc1 = adjusted_bias_data_s_1;
5273 
5274           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5275           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5276           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5277           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5278           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5279           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5280 
5281           // Fixed-point multiplication.
5282           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5283           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5284               acc0, -output_shift);
5285           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5286           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5287               acc1, -output_shift);
5288           // Add the output offset.
5289           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5290           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5291           // Apply the activation function.
5292           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5293           acc_u8_0_1 =
5294               util_vmax_x8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
5295           acc_u8_0_1 =
5296               util_vmin_x8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
5297 
5298           vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5299           vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
5300 
5301           input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5302           input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5303           input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5304 
5305           output_data += output_depth;
5306         }
5307         for (; i_width < output_width_overall_micro_repeats; ++i_width) {
5308           // output_width == 1.
5309           const int8* input_data = scratch_data + 4 + 4 * i_width;
5310 
5311           // Load next sub-micro block of data.
5312           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5313           input_bank_a_reg = vld1q_lane_8x4(
5314               input_data + workspace_height_stride, input_bank_a_reg, 3);
5315           input_bank_b_reg = vld1q_lane_8x4(
5316               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5317           input_bank_b_reg = vld1q_lane_8x4(
5318               input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
5319           input_bank_c_reg = vld1q_lane_8x4(
5320               input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
5321 
5322           int16x8_t acc_s16_0_1;
5323           uint8x8_t acc_u8_0_1;
5324           // Iterate over input width shifts within 4x4 blocks.
5325           {
5326             acc0 = adjusted_bias_data_s_0;
5327             acc1 = adjusted_bias_data_s_0;
5328 
5329             acc0 =
5330                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5331             acc0 =
5332                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5333             acc0 =
5334                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5335             acc1 =
5336                 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
5337             acc1 =
5338                 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
5339             acc1 =
5340                 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
5341 
5342             // Fixed-point multiplication.
5343             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5344             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5345                 acc0, -output_shift);
5346             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5347             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5348                 acc1, -output_shift);
5349             // Add the output offset.
5350             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5351             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5352             // Apply the activation function.
5353             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5354             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5355                                       vget_low_u8(output_activation_min_vec));
5356             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5357                                       vget_low_u8(output_activation_max_vec));
5358 
5359             vst1_lane_8x4(output_data, acc_u8_0_1, 0);
5360             vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
5361 
5362             acc0 = adjusted_bias_data_s_1;
5363             acc1 = adjusted_bias_data_s_1;
5364 
5365             acc0 =
5366                 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
5367             acc0 =
5368                 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
5369             acc0 =
5370                 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
5371             acc1 =
5372                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
5373             acc1 =
5374                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
5375             acc1 =
5376                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
5377 
5378             // Fixed-point multiplication.
5379             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5380             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5381                 acc0, -output_shift);
5382             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5383             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5384                 acc1, -output_shift);
5385             // Add the output offset.
5386             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5387             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5388             // Apply the activation function.
5389             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5390             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5391                                       vget_low_u8(output_activation_min_vec));
5392             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5393                                       vget_low_u8(output_activation_max_vec));
5394 
5395             vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
5396             vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
5397                           1);
5398 
5399             input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5400             input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5401             input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
5402 
5403             output_data += output_depth;
5404           }
5405         }
5406       } else {
5407         TFLITE_DCHECK_EQ(block_height, 1);
5408         // Work through one slice, by row, at a time.
5409         const int8* scratch_data = scratch_block_data;
5410         typename QuantizationTypeImpl<quantization_type>::ExternalType*
5411             output_data = output_block_data + 8 * j_depth;
5412 
5413         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
5414         int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
5415 
5416         // Load first sub-micro block of data into operational banks.
5417         input_bank_a_reg =
5418             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
5419                                            // uninitialized variable.
5420         input_bank_a_reg = vld1q_lane_8x4(
5421             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
5422         input_bank_b_reg = vld1q_dup_s8x4(
5423             scratch_data +
5424             2 * workspace_height_stride);  // Load lane 0, avoiding
5425                                            // uninitialized variable.
5426 
5427         int32x4_t acc0;
5428         int32x4_t acc1;
5429 
5430         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
5431              ++i_width) {
5432           const int output_width =
5433               i_width == output_width_micro_repeats ? residual_width : 2;
5434 
5435           TFLITE_DCHECK_LE(output_width, 2);
5436           TFLITE_DCHECK_GE(output_width, 1);
5437           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
5438           const int8* input_data = scratch_data + 4 + 4 * i_width;
5439 
5440           // Load next sub-micro block of data.
5441           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
5442           input_bank_a_reg = vld1q_lane_8x4(
5443               input_data + workspace_height_stride, input_bank_a_reg, 3);
5444           input_bank_b_reg = vld1q_lane_8x4(
5445               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
5446 
5447           int16x8_t acc_s16_0_1;
5448           uint8x8_t acc_u8_0_1;
5449 
5450           // Iterate over input width shifts within 4x4 blocks.
5451           {
5452             acc0 = adjusted_bias_data_s_0;
5453 
5454             acc0 =
5455                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5456             acc0 =
5457                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5458             acc0 =
5459                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5460 
5461             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5462             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5463                 acc0, -output_shift);
5464 
5465             // Second sub-block accumulation.
5466             acc1 = adjusted_bias_data_s_1;
5467 
5468             acc1 =
5469                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5470             acc1 =
5471                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5472             acc1 =
5473                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5474 
5475             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5476             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5477                 acc1, -output_shift);
5478 
5479             // Add the output offset.
5480             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5481             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5482             // Apply the activation function.
5483             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5484             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5485                                       vget_low_u8(output_activation_min_vec));
5486             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5487                                       vget_low_u8(output_activation_max_vec));
5488 
5489             // This stores the results for both sub-blocks together.
5490             util_vst1_x8(output_data, acc_u8_0_1);
5491 
5492             input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5493             input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5494 
5495             output_data += output_depth;
5496           }
5497           if (output_width == 2) {
5498             acc0 = adjusted_bias_data_s_0;
5499 
5500             acc0 =
5501                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
5502             acc0 =
5503                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
5504             acc0 =
5505                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
5506 
5507             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
5508             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5509                 acc0, -output_shift);
5510 
5511             // Second sub-block accumulation.
5512             acc1 = adjusted_bias_data_s_1;
5513 
5514             acc1 =
5515                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
5516             acc1 =
5517                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
5518             acc1 =
5519                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
5520 
5521             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
5522             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
5523                 acc1, -output_shift);
5524 
5525             // Add the output offset.
5526             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5527             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5528             // Apply the activation function.
5529             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
5530             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
5531                                       vget_low_u8(output_activation_min_vec));
5532             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
5533                                       vget_low_u8(output_activation_max_vec));
5534 
5535             // This stores the results for both sub-blocks together.
5536             util_vst1_x8(output_data, acc_u8_0_1);
5537 
5538             input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
5539             input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
5540 
5541             output_data += output_depth;
5542           }
5543         }
5544       }
5545     }
5546   }
5547 
5548   static inline void Run(const int8* scratch_block_data,
5549                          const int8* filter_workspace, const int32* bias_data,
5550                          uint8* output_block_data,
5551                          const DepthwiseConvDotProdParams* function_params) {
5552     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
5553                                output_block_data, function_params);
5554   }
5555 };
5556 
5557 template <>
5558 struct KernelMacroBlock<
5559     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
5560     QuantizationType::kPerChannelInt8,
5561     DepthwiseConvDepthMultiplication::kNoMultiplication,
5562     /*stride=*/1> {
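  // Stride-1, no-depth-multiplication variant with per-channel int8
  // quantization. The helpers below select the signed saturating
  // narrow/min/max forms used by this specialization.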
5563   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
5564   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
5565     return vmin_s8(a, b);
5566   }
5567   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
5568     return vmax_s8(a, b);
5569   }
5570   static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
5571     return vminq_s8(a, b);
5572   }
5573   static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
5574     return vmaxq_s8(a, b);
5575   }
5576 
5577   static inline void KernelMacroBlockIntrinsics(
5578       const int8* scratch_block_data, const int8* filter_workspace,
5579       const int32* bias_data, int8* output_block_data,
5580       const DepthwiseConvDotProdParams* function_params) {
5581     static constexpr QuantizationType quantization_type =
5582         QuantizationType::kPerChannelInt8;
5583 
5584     const int workspace_height_stride =
5585         function_params->workspace_height_stride;
5586     const int input_width_overall_micro_repeats =
5587         function_params->input_width_overall_micro_repeats;
5588     const int output_width_micro_repeats =
5589         function_params->output_width_micro_repeats;
5590     const int depth_micro_repeats = function_params->depth_micro_repeats;
5591     const int depth = function_params->input_depth;
5592 
5593     const int output_width_overall_micro_repeats =
5594         function_params->output_width_overall_micro_repeats;
5595     const int block_height = function_params->outbound_block_height;
5596     const int residual_width = function_params->output_residual_width;
5597     const int output_height_stride = function_params->output_height_stride;
5598     constexpr int kBiasIncrement = 4;
5599 
5600     TFLITE_DCHECK(depth_micro_repeats > 0);
5601     const int width_micro_stride = 4 * 8;
5602     const int depth_micro_stride =
5603         width_micro_stride * input_width_overall_micro_repeats;
5604 
5605     const int32 output_activation_min =
5606         function_params->quantized_activation_min;
5607     const int32 output_activation_max =
5608         function_params->quantized_activation_max;
5609     const int32 output_offset = function_params->output_offset;
5610     const int32* output_shift_per_channel =
5611         function_params->output_shift_per_channel;
5612     const int32* output_multiplier_per_channel =
5613         function_params->output_multiplier_per_channel;
5614     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
5615       TFLITE_DCHECK_GE(output_activation_min, 0);
5616       TFLITE_DCHECK_LT(output_activation_min, 256);
5617       TFLITE_DCHECK_GE(output_activation_max, 0);
5618       TFLITE_DCHECK_LT(output_activation_max, 256);
5619     } else {
5620       TFLITE_DCHECK_GE(output_activation_min, -128);
5621       TFLITE_DCHECK_LT(output_activation_min, 128);
5622       TFLITE_DCHECK_GE(output_activation_max, -128);
5623       TFLITE_DCHECK_LT(output_activation_max, 128);
5624       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
5625       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
5626     }
5627     TFLITE_DCHECK_GE(output_offset, -32768);
5628     TFLITE_DCHECK_LT(output_offset, 32768);
5629 
5630     const int16x8_t output_offset_vec =
5631         vdupq_n_s16(static_cast<int16>(output_offset));
5632     const int8x16_t output_activation_min_vec =
5633         vdupq_n_s8(static_cast<int8>(output_activation_min));
5634     const int8x16_t output_activation_max_vec =
5635         vdupq_n_s8(static_cast<int8>(output_activation_max));
5636 
5637     const int8* input_data_depthwise = scratch_block_data;
5638     typename QuantizationTypeImpl<quantization_type>::ExternalType*
5639         output_data_depthwise = output_block_data;
5640     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
5641       // Simulate NEON-register transposition of subset of filter.
5642       int8x16_t filter_reg_0_a;
5643       int8x16_t filter_reg_0_b;
5644       int8x16_t filter_reg_1_a;
5645       int8x16_t filter_reg_1_b;
5646       int8x16_t filter_reg_2_a;
5647       int8x16_t filter_reg_2_b;
5648       int8x16_t filter_reg_0_a_shifted;
5649       int8x16_t filter_reg_1_a_shifted;
5650       int8x16_t filter_reg_2_a_shifted;
5651 
5652       filter_reg_0_a = vld1q_s8(filter_workspace);
5653       filter_workspace += 16;
5654       filter_reg_0_b = vld1q_s8(filter_workspace);
5655       filter_workspace += 16;
5656       filter_reg_1_a = vld1q_s8(filter_workspace);
5657       filter_workspace += 16;
5658       filter_reg_1_b = vld1q_s8(filter_workspace);
5659       filter_workspace += 16;
5660       filter_reg_2_a = vld1q_s8(filter_workspace);
5661       filter_workspace += 16;
5662       filter_reg_2_b = vld1q_s8(filter_workspace);
5663       filter_workspace += 16;
5664 
5665       filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
5666       filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
5667       filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
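      // The *_shifted copies hold each filter row shifted left by one byte
      // within every 32-bit lane (the vacated byte becomes zero). In effect
      // this aligns the three filter taps with the input window advanced by
      // one column, so the loop below can compute two adjacent output columns
      // from the same input banks before the banks themselves are rotated.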
5668 
5669       if (block_height == 4) {
5670         for (int s = 0; s < 2; ++s) {
5671           // Work through one slice, by row, at a time.
5672           const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
5673           typename QuantizationTypeImpl<quantization_type>::ExternalType*
5674               output_data_base = output_data_depthwise + 4 * s;
5675 
5676           const int8* next_input_data = input_data_base;
5677           typename QuantizationTypeImpl<quantization_type>::ExternalType*
5678               output_data = output_data_base;
5679 
5680           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
5681           bias_data += kBiasIncrement;
5682 
5683           const int32x4_t output_shift =
5684               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
5685           const int32x4_t output_multiplier =
5686               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
5687 
5688           // Load first sub-micro block of data into operational banks.
5689           int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
5690           int8x16_t left_bank_1_reg =
5691               vld1q_s8(next_input_data + workspace_height_stride);
5692           int8x16_t left_bank_2_reg =
5693               vld1q_s8(next_input_data + 2 * workspace_height_stride);
5694           int8x16_t left_bank_3_reg =
5695               vld1q_s8(next_input_data + 3 * workspace_height_stride);
5696           int8x16_t left_bank_4_reg =
5697               vld1q_s8(next_input_data + 4 * workspace_height_stride);
5698           int8x16_t left_bank_5_reg =
5699               vld1q_s8(next_input_data + 5 * workspace_height_stride);
5700 
5701           int32x4_t acc0;
5702           int32x4_t acc1;
5703           int32x4_t acc2;
5704           int32x4_t acc3;
5705 
5706           acc0 = adjusted_bias_data;
5707           acc1 = adjusted_bias_data;
5708           acc2 = adjusted_bias_data;
5709           acc3 = adjusted_bias_data;
5710 
5711           acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5712           acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5713           acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5714           acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
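          // The four row accumulators are pre-seeded here with the dot
          // products that only depend on banks already in registers; the
          // remaining contributions are added at the top of the width loop.
          // This manual software pipelining lets the next loads overlap with
          // useful arithmetic.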
5715 
5716           for (int i_width = 0; i_width < output_width_micro_repeats;
5717                ++i_width) {
5718             next_input_data += width_micro_stride;
5719 
5720             // Iterate over input width shifts within 4x4 blocks.
5721             {
5722               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5723               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5724               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5725               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5726               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5727               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5728               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5729               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5730 
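              // The requantization below is the usual TFLite per-channel
              // scheme: a saturating rounding doubling high multiply
              // (vqrdmulhq_s32) followed by a rounding shift by the
              // per-channel exponent, then offset, narrowing and clamping. A
              // rough scalar sketch (names illustrative; assumes a
              // non-positive exponent as in the reference
              // MultiplyByQuantizedMultiplier path):
              //   acc = SaturatingRoundingDoublingHighMul(acc, multiplier);
              //   acc = RoundingDivideByPOT(acc, -shift);
              //   acc += output_offset;
              //   out = std::min(std::max(acc, act_min), act_max);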
5731               // Fixed-point multiplication.
5732               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5733               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5734                   acc0, output_shift);
5735               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5736               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5737                   acc1, output_shift);
5738               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5739               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5740                   acc2, output_shift);
5741               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5742               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5743                   acc3, output_shift);
5744               // Add the output offset.
5745               int16x8_t acc_s16_0_1 =
5746                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5747               int16x8_t acc_s16_2_3 =
5748                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5749               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5750               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5751               // Apply the activation function.
5752               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5753                                                  vqmovxn_s16(acc_s16_2_3));
5754               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5755               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5756 
5757               vst1q_lane_8x4(output_data, acc_u8_all, 0);
5758               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5759               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5760                              2);
5761               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5762                              3);
5763 
5764               output_data += depth;
5765             }
5766 
5767             // Load next sub-micro block of data.
5768             int8x16_t right_bank_0_reg;
5769             int8x16_t right_bank_1_reg;
5770             int8x16_t right_bank_2_reg;
5771             int8x16_t right_bank_3_reg;
5772             int8x16_t right_bank_4_reg;
5773             int8x16_t right_bank_5_reg;
5774 
5775             // Loading of next block always valid.
5776             right_bank_0_reg = vld1q_s8(next_input_data);
5777             right_bank_1_reg =
5778                 vld1q_s8(next_input_data + workspace_height_stride);
5779             right_bank_2_reg =
5780                 vld1q_s8(next_input_data + 2 * workspace_height_stride);
5781             right_bank_3_reg =
5782                 vld1q_s8(next_input_data + 3 * workspace_height_stride);
5783             right_bank_4_reg =
5784                 vld1q_s8(next_input_data + 4 * workspace_height_stride);
5785             right_bank_5_reg =
5786                 vld1q_s8(next_input_data + 5 * workspace_height_stride);
5787 
5788             {
5789               acc0 = adjusted_bias_data;
5790               acc1 = adjusted_bias_data;
5791               acc2 = adjusted_bias_data;
5792               acc3 = adjusted_bias_data;
5793 
5794               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5795               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5796               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5797               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5798               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5799               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5800               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5801               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5802               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5803               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5804               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5805               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5806 
5807               // Fixed-point multiplication.
5808               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5809               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5810                   acc0, output_shift);
5811               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5812               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5813                   acc1, output_shift);
5814               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5815               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5816                   acc2, output_shift);
5817               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5818               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5819                   acc3, output_shift);
5820               // Add the output offset.
5821               int16x8_t acc_s16_0_1 =
5822                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5823               int16x8_t acc_s16_2_3 =
5824                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5825               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5826               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5827               // Apply the activation function.
5828               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5829                                                  vqmovxn_s16(acc_s16_2_3));
5830               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5831               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5832 
5833               vst1q_lane_8x4(output_data, acc_u8_all, 0);
5834               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5835               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5836                              2);
5837               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5838                              3);
5839 
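              // Rotate the input window within the banks: vrev32q_u16 swaps
              // the two pixel pairs inside each 32-bit lane, and
              // vtrn1_s8x2_in_place then splices in the leading bytes of the
              // right-hand bank, which in effect advances the four-pixel
              // window by two positions without touching memory again.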
5840               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
5841               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
5842               left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
5843               left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
5844               left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
5845               left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
5846               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
5847               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
5848               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
5849               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
5850               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
5851               vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
5852 
5853               output_data += depth;
5854             }
5855 
5856             {
5857               acc0 = adjusted_bias_data;
5858               acc1 = adjusted_bias_data;
5859               acc2 = adjusted_bias_data;
5860               acc3 = adjusted_bias_data;
5861 
5862               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
5863               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
5864               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5865               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
5866               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5867               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
5868               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5869               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
5870               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
5871               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5872               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
5873               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
5874 
5875               // Fixed-point multiplication.
5876               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5877               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5878                   acc0, output_shift);
5879               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5880               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5881                   acc1, output_shift);
5882               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5883               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5884                   acc2, output_shift);
5885               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5886               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5887                   acc3, output_shift);
5888               // Add the output offset.
5889               int16x8_t acc_s16_0_1 =
5890                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5891               int16x8_t acc_s16_2_3 =
5892                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5893               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5894               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5895               // Apply the activation function.
5896               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5897                                                  vqmovxn_s16(acc_s16_2_3));
5898               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5899               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5900 
5901               vst1q_lane_8x4(output_data, acc_u8_all, 0);
5902               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5903               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5904                              2);
5905               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5906                              3);
5907 
5908               output_data += depth;
5909             }
5910 
5911             {
5912               acc0 = adjusted_bias_data;
5913               acc1 = adjusted_bias_data;
5914               acc2 = adjusted_bias_data;
5915               acc3 = adjusted_bias_data;
5916 
5917               acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
5918               acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
5919               acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
5920               acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
5921               acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
5922               acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
5923               acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
5924               acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
5925               acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
5926               acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
5927               acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
5928               acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
5929 
5930               // Fixed-point multiplication.
5931               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
5932               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5933                   acc0, output_shift);
5934               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
5935               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5936                   acc1, output_shift);
5937               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
5938               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5939                   acc2, output_shift);
5940               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
5941               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
5942                   acc3, output_shift);
5943               // Add the output offset.
5944               int16x8_t acc_s16_0_1 =
5945                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
5946               int16x8_t acc_s16_2_3 =
5947                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
5948               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
5949               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
5950               // Apply the activation function.
5951               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
5952                                                  vqmovxn_s16(acc_s16_2_3));
5953               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
5954               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
5955 
5956               vst1q_lane_8x4(output_data, acc_u8_all, 0);
5957               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
5958               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
5959                              2);
5960               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
5961                              3);
5962 
5963               left_bank_0_reg = right_bank_0_reg;
5964               left_bank_1_reg = right_bank_1_reg;
5965               left_bank_2_reg = right_bank_2_reg;
5966               left_bank_3_reg = right_bank_3_reg;
5967               left_bank_4_reg = right_bank_4_reg;
5968               left_bank_5_reg = right_bank_5_reg;
5969 
5970               output_data += depth;
5971               acc0 = adjusted_bias_data;
5972               acc1 = adjusted_bias_data;
5973               acc2 = adjusted_bias_data;
5974               acc3 = adjusted_bias_data;
5975 
5976               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
5977               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
5978               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
5979               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
5980             }
5981           }
5982 
5983           if (residual_width > 0) {
5984             next_input_data += width_micro_stride;
5985             const int output_width = residual_width;
5986 
5987             // Load next sub-micro block of data.
5988             int8x16_t right_bank_0_reg;
5989             int8x16_t right_bank_1_reg;
5990             int8x16_t right_bank_2_reg;
5991             int8x16_t right_bank_3_reg;
5992             int8x16_t right_bank_4_reg;
5993             int8x16_t right_bank_5_reg;
5994             // Logic: (output_width - 1) * stride_val < 2.
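            // With stride 1 the last output column in this residual group is
            // output_width - 1 and reads input columns up to
            // (output_width - 1) + 2, so for output_width < 3 everything
            // needed is already resident in the left banks.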
5995             const bool no_right_block = output_width < 3;
5996 
5997             if (no_right_block) {
5998               // Only needed for sanitizer checks.
5999               right_bank_0_reg = vdupq_n_s8(0);
6000               right_bank_1_reg = vdupq_n_s8(0);
6001               right_bank_2_reg = vdupq_n_s8(0);
6002               right_bank_3_reg = vdupq_n_s8(0);
6003               right_bank_4_reg = vdupq_n_s8(0);
6004               right_bank_5_reg = vdupq_n_s8(0);
6005             } else {
6006               right_bank_0_reg = vld1q_s8(next_input_data);
6007               right_bank_1_reg =
6008                   vld1q_s8(next_input_data + workspace_height_stride);
6009               right_bank_2_reg =
6010                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
6011               right_bank_3_reg =
6012                   vld1q_s8(next_input_data + 3 * workspace_height_stride);
6013               right_bank_4_reg =
6014                   vld1q_s8(next_input_data + 4 * workspace_height_stride);
6015               right_bank_5_reg =
6016                   vld1q_s8(next_input_data + 5 * workspace_height_stride);
6017             }
6018 
6019             // Iterate over input width shifts within 4x4 blocks.
6020             for (int x = 0; x < output_width; ++x) {
6021               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6022               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6023               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
6024               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
6025               acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
6026               acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
6027               acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
6028               acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
6029 
6030               // Fixed-point multiplication.
6031               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6032               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6033                   acc0, output_shift);
6034               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6035               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6036                   acc1, output_shift);
6037               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6038               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6039                   acc2, output_shift);
6040               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6041               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6042                   acc3, output_shift);
6043               // Add the output offset.
6044               int16x8_t acc_s16_0_1 =
6045                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6046               int16x8_t acc_s16_2_3 =
6047                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6048               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6049               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6050               // Apply the activation function.
6051               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
6052                                                  vqmovxn_s16(acc_s16_2_3));
6053               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6054               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6055 
6056               vst1q_lane_8x4(output_data, acc_u8_all, 0);
6057               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
6058               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
6059                              2);
6060               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
6061                              3);
6062 
6063               biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
6064               biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
6065               biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
6066               biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
6067               biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
6068               biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
6069 
6070               output_data += depth;
6071 
6072               acc0 = adjusted_bias_data;
6073               acc1 = adjusted_bias_data;
6074               acc2 = adjusted_bias_data;
6075               acc3 = adjusted_bias_data;
6076 
6077               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6078               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
6079               acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
6080               acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
6081             }
6082           }
6083           input_data_base += 4 * workspace_height_stride;
6084           output_data_base += 4 * output_height_stride;
6085 
6086           // Move to the next sub-block: advance to the second set of
6087           // filters and to the next bias values.
6088           filter_reg_0_a = filter_reg_0_b;
6089           filter_reg_1_a = filter_reg_1_b;
6090           filter_reg_2_a = filter_reg_2_b;
6091           filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
6092           filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
6093           filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
6094         }
6095       } else {
6096         const int8* input_data_base = input_data_depthwise;
6097         typename QuantizationTypeImpl<quantization_type>::ExternalType*
6098             output_data_base = output_data_depthwise;
6099 
6100         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6101         bias_data += kBiasIncrement;
6102         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6103         bias_data += kBiasIncrement;
6104 
6105         const int32x4_t output_shift_a =
6106             vld1q_s32(output_shift_per_channel + j_depth * 8);
6107         const int32x4_t output_multiplier_a =
6108             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6109         const int32x4_t output_shift_b =
6110             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6111         const int32x4_t output_multiplier_b =
6112             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6113 
6114         for (int k_height = 0; k_height < block_height; ++k_height) {
6115           const int8* next_input_data = input_data_base;
6116           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6117               output_data = output_data_base;
6118 
6119           // Load first sub-micro block of data into operational banks.
6120           int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
6121           int8x16_t left_bank_1_reg_a =
6122               vld1q_s8(next_input_data + workspace_height_stride);
6123           int8x16_t left_bank_2_reg_a =
6124               vld1q_s8(next_input_data + 2 * workspace_height_stride);
6125           int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6126           int8x16_t left_bank_1_reg_b =
6127               vld1q_s8(next_input_data + workspace_height_stride + 16);
6128           int8x16_t left_bank_2_reg_b =
6129               vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6130 
6131           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6132                ++i_width) {
6133             next_input_data += width_micro_stride;
6134             const int output_width =
6135                 i_width == output_width_micro_repeats ? residual_width : 4;
6136 
6137             int8x16_t right_bank_0_reg_a;
6138             int8x16_t right_bank_1_reg_a;
6139             int8x16_t right_bank_2_reg_a;
6140             int8x16_t right_bank_0_reg_b;
6141             int8x16_t right_bank_1_reg_b;
6142             int8x16_t right_bank_2_reg_b;
6143             // Logic: (output_width - 1) * stride_val < 2.
6144             const bool no_right_block = output_width < 3;
6145 
6146             // Load next sub-micro block of data.
6147             if (no_right_block) {
6148               // Only needed for sanitizer checks.
6149               right_bank_0_reg_a = vdupq_n_s8(0);
6150               right_bank_1_reg_a = vdupq_n_s8(0);
6151               right_bank_2_reg_a = vdupq_n_s8(0);
6152               right_bank_0_reg_b = vdupq_n_s8(0);
6153               right_bank_1_reg_b = vdupq_n_s8(0);
6154               right_bank_2_reg_b = vdupq_n_s8(0);
6155             } else {
6156               right_bank_0_reg_a = vld1q_s8(next_input_data);
6157               right_bank_1_reg_a =
6158                   vld1q_s8(next_input_data + workspace_height_stride);
6159               right_bank_2_reg_a =
6160                   vld1q_s8(next_input_data + 2 * workspace_height_stride);
6161               right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
6162               right_bank_1_reg_b =
6163                   vld1q_s8(next_input_data + workspace_height_stride + 16);
6164               right_bank_2_reg_b =
6165                   vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
6166             }
6167 
6168             // Iterate over input width shifts within 4x4 blocks.
6169             for (int x = 0; x < output_width; ++x) {
6170               int32x4_t acc_a = adjusted_bias_data_a;
6171               int32x4_t acc_b = adjusted_bias_data_b;
6172               acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
6173               acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
6174               acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
6175               acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
6176               acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
6177               acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
6178 
6179               // Fixed-point multiplication.
6180               acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
6181               acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
6182               acc_a =
6183                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6184                       acc_a, output_shift_a);
6185               acc_b =
6186                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6187                       acc_b, output_shift_b);
6188               // Add the output offset.
6189               int16x8_t acc_s16_0_0 =
6190                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
6191               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
6192               // Apply the activation function.
6193               int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
6194               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
6195                                         vget_low_s8(output_activation_min_vec));
6196               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
6197                                         vget_low_s8(output_activation_max_vec));
6198 
6199               vst1_s8(output_data, acc_u8_0_0);
6200 
6201               biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
6202               biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
6203               biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
6204               biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
6205               biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
6206               biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
6207 
6208               output_data += depth;
6209             }
6210           }
6211           input_data_base += workspace_height_stride;
6212           output_data_base += output_height_stride;
6213         }
6214       }
6215       input_data_depthwise += depth_micro_stride;
6216       output_data_depthwise += 8;
6217     }
6218   }  // NOLINT(readability/fn_size) Manually unrolled.
6219 
6220   static inline void Run(const int8* scratch_block_data,
6221                          const int8* filter_workspace, const int32* bias_data,
6222                          int8* output_block_data,
6223                          const DepthwiseConvDotProdParams* function_params) {
6224     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6225                                output_block_data, function_params);
6226   }
6227 };
6228 
6229 template <>
6230 struct KernelMacroBlock<
6231     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6232     QuantizationType::kPerChannelInt8,
6233     DepthwiseConvDepthMultiplication::kNoMultiplication,
6234     /*stride=*/2> {
6235   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6236   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6237     return vmin_s8(a, b);
6238   }
6239   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6240     return vmax_s8(a, b);
6241   }
6242 
6243   static inline void KernelMacroBlockIntrinsics(
6244       const int8* scratch_block_data, const int8* filter_workspace,
6245       const int32* bias_data, int8* output_block_data,
6246       const DepthwiseConvDotProdParams* function_params) {
6247     static constexpr QuantizationType quantization_type =
6248         QuantizationType::kPerChannelInt8;
6249 
6250     const int workspace_height_stride =
6251         function_params->workspace_height_stride;
6252     const int input_width_overall_micro_repeats =
6253         function_params->input_width_overall_micro_repeats;
6254     const int output_width_micro_repeats =
6255         function_params->output_width_micro_repeats;
6256     const int depth_micro_repeats = function_params->depth_micro_repeats;
6257     const int depth = function_params->input_depth;
6258     constexpr int kStrideVal = 2;
6259     constexpr int kFourOverStride = 2;
6260     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
6261     TFLITE_DCHECK_EQ(function_params->four_over_stride, kFourOverStride);
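    // With stride 2, each 4-wide input micro block produces 4 / stride = 2
    // output columns, which is what four_over_stride records.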
6262 
6263     const int workspace_width_micro_repeats =
6264         function_params->workspace_width_micro_repeats;
6265     const int output_width_overall_micro_repeats =
6266         function_params->output_width_overall_micro_repeats;
6267     const int block_height = function_params->outbound_block_height;
6268     const int residual_width = function_params->output_residual_width;
6269     const int output_height_stride = function_params->output_height_stride;
6270     constexpr int kBiasIncrement = 4;
6271 
6272     TFLITE_DCHECK(depth_micro_repeats > 0);
6273     const int width_micro_stride = 4 * 8;
6274     const int depth_micro_stride =
6275         width_micro_stride * input_width_overall_micro_repeats;
6276 
6277     const int32 output_activation_min =
6278         function_params->quantized_activation_min;
6279     const int32 output_activation_max =
6280         function_params->quantized_activation_max;
6281     const int32 output_offset = function_params->output_offset;
6282     const int32* output_shift_per_channel =
6283         function_params->output_shift_per_channel;
6284     const int32* output_multiplier_per_channel =
6285         function_params->output_multiplier_per_channel;
6286     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6287       TFLITE_DCHECK_GE(output_activation_min, 0);
6288       TFLITE_DCHECK_LT(output_activation_min, 256);
6289       TFLITE_DCHECK_GE(output_activation_max, 0);
6290       TFLITE_DCHECK_LT(output_activation_max, 256);
6291     } else {
6292       TFLITE_DCHECK_GE(output_activation_min, -128);
6293       TFLITE_DCHECK_LT(output_activation_min, 128);
6294       TFLITE_DCHECK_GE(output_activation_max, -128);
6295       TFLITE_DCHECK_LT(output_activation_max, 128);
6296       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6297       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6298     }
6299     TFLITE_DCHECK_GE(output_offset, -32768);
6300     TFLITE_DCHECK_LT(output_offset, 32768);
6301 
6302     // This version only applies the activation min/max on 64-bit (int8x8) vectors.
6303     const int16x8_t output_offset_vec =
6304         vdupq_n_s16(static_cast<int16>(output_offset));
6305     const int8x8_t output_activation_min_vec =
6306         vdup_n_s8(static_cast<int8>(output_activation_min));
6307     const int8x8_t output_activation_max_vec =
6308         vdup_n_s8(static_cast<int8>(output_activation_max));
6309 
6310     constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
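    // Each depth micro block (8 output channels) consumes six 16-byte
    // registers of shuffled filter data: 2 register halves * 3 filter rows *
    // 4 lanes * 4 bytes = 96 bytes, the increment applied per j_depth below.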
6311 
6312     TFLITE_DCHECK_LE(block_height, 2);
6313 
6314     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6315       const int8* filter_block =
6316           filter_workspace + shuffled_filter_increment * j_depth;
6317 
6318       if (block_height == 2) {
6319         for (int s = 0; s < 2; ++s) {
6320           // Simulate NEON-register transposition of subset of filter.
6321           int8x16_t filter_reg_0_a;
6322           int8x16_t filter_reg_1_a;
6323           int8x16_t filter_reg_2_a;
6324 
6325           filter_reg_0_a = vld1q_s8(filter_block + s * 16);
6326           filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
6327           filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
6328 
6329           const int8* scratch_data =
6330               scratch_block_data + depth_micro_stride * j_depth;
6331           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6332               output_data = output_block_data + 8 * j_depth;
6333           const int8* input_data_0 = scratch_data + s * 2 * 8;
6334 
6335           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6336 
6337           const int32x4_t output_shift =
6338               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6339           const int32x4_t output_multiplier =
6340               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6341 
6342           // Load first sub-micro block of data into operational banks.
6343           int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
6344           int8x16_t left_bank_1_reg =
6345               vld1q_s8(input_data_0 + workspace_height_stride);
6346           int8x16_t left_bank_2_reg =
6347               vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6348           int8x16_t left_bank_3_reg =
6349               vld1q_s8(input_data_0 + 3 * workspace_height_stride);
6350           int8x16_t left_bank_4_reg =
6351               vld1q_s8(input_data_0 + 4 * workspace_height_stride);
6352 
6353           int8x16_t right_bank_0_reg;
6354           int8x16_t right_bank_1_reg;
6355           int8x16_t right_bank_2_reg;
6356           int8x16_t right_bank_3_reg;
6357           int8x16_t right_bank_4_reg;
6358 
6359           int32x4_t acc0;
6360           int32x4_t acc1;
6361           int16x8_t acc_s16_0_1;
6362           int8x8_t acc_u8;
6363 
6364           int i_width = 0;
6365 
6366           // When output_width_micro_repeats <
6367           // output_width_overall_micro_repeats, we have 0 < residual_width <= 2,
6368           // so checking residual_width == 1 is equivalent to residual_width < 2.
6369           const int adjusted_width_micro_repeats =
6370               (output_width_micro_repeats <
6371                output_width_overall_micro_repeats) &&
6372                       (residual_width == 1)
6373                   ? output_width_micro_repeats
6374                   : output_width_overall_micro_repeats;
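          // The residual repeat is peeled out of the main loop only when it
          // produces a single output column (residual_width == 1), because in
          // that case the right-hand input block must not be read; a full
          // two-output residual repeat is left to the main loop, which always
          // loads the next block.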
6375 
6376           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6377             const int output_width = kFourOverStride;
6378             TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6379             const int8* input_data =
6380                 input_data_0 + width_micro_stride * i_width;
6381             acc0 = adjusted_bias_data;
6382             acc1 = adjusted_bias_data;
6383             right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
6384             right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
6385                                         workspace_height_stride);
6386 
6387             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6388             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6389             typename QuantizationTypeImpl<quantization_type>::ExternalType*
6390                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6391 
6392             right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
6393                                         2 * workspace_height_stride);
6394             right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
6395                                         3 * workspace_height_stride);
6396             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6397             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6398             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6399             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6400             right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
6401                                         4 * workspace_height_stride);
6402 
6403             // Fixed-point multiplication.
6404             acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6405             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6406                 acc0, output_shift);
6407             acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6408             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6409                 acc1, output_shift);
6410             // Add the output offset.
6411             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6412             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6413             // Apply the activation function.
6414             acc_u8 = vqmovxn_s16(acc_s16_0_1);
6415             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6416             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6417 
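            // The window rotation for the second output pair below is
            // interleaved with the stores of the first pair so that shuffles
            // and stores can overlap in the pipeline.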
6418             left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
6419             left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
6420             left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
6421             left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
6422             left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
6423             acc0 = adjusted_bias_data;
6424             acc1 = adjusted_bias_data;
6425             vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6426             vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6427             vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6428             vst1_lane_8x4(output_data_base, acc_u8, 0);
6429             vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
6430 
6431             vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6432             vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6433 
6434             acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6435             acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6436             acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6437             acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6438             acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6439             acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6440 
6441             // Fixed-point multiplication.
6442             acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6443             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6444                 acc0, output_shift);
6445             acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6446             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6447                 acc1, output_shift);
6448             // Add the output offset.
6449             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6450             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6451             // Apply the activation function.
6452             acc_u8 = vqmovxn_s16(acc_s16_0_1);
6453             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6454             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6455 
6456             vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
6457             vst1_lane_8x4(output_data_base + depth + output_height_stride,
6458                           acc_u8, 1);
6459 
6460             left_bank_0_reg = right_bank_0_reg;
6461             left_bank_1_reg = right_bank_1_reg;
6462             left_bank_2_reg = right_bank_2_reg;
6463             left_bank_3_reg = right_bank_3_reg;
6464             left_bank_4_reg = right_bank_4_reg;
6465           }
6466           for (; i_width < output_width_overall_micro_repeats; ++i_width) {
6467             TFLITE_DCHECK_NE(residual_width, kFourOverStride);
6468 
6469             // No need to load next ("right") block of data.
6470 
6471             typename QuantizationTypeImpl<quantization_type>::ExternalType*
6472                 output_data_base = output_data + depth * 2 * i_width + 4 * s;
6473 
6474             // Iterate over input width shifts within 4x4 blocks.
6475             {
6476               acc0 = adjusted_bias_data;
6477               acc1 = adjusted_bias_data;
6478 
6479               acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
6480               acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
6481               acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
6482               acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
6483               acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
6484               acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
6485 
6486               // Fixed-point multiplication.
6487               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6488               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6489                   acc0, output_shift);
6490               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6491               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6492                   acc1, output_shift);
6493               // Add the output offset.
6494               int16x8_t acc_s16_0_1 =
6495                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6496               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6497               // Apply the activation function.
6498               int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6499               acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6500               acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6501 
6502               vst1_lane_8x4(output_data_base, acc_u8, 0);
6503               vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
6504 
6505               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
6506               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
6507               left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
6508               left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
6509               left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
6510               vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
6511               vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
6512               vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
6513               vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
6514               vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
6515             }
6516           }
6517           bias_data += kBiasIncrement;
6518         }
6519       } else {
6520         // block_height == 1.
6521         int8x16_t filter_reg_0_a;
6522         int8x16_t filter_reg_1_a;
6523         int8x16_t filter_reg_2_a;
6524         int8x16_t filter_reg_0_b;
6525         int8x16_t filter_reg_1_b;
6526         int8x16_t filter_reg_2_b;
6527 
6528         filter_reg_0_a = vld1q_s8(filter_block);
6529         filter_reg_1_a = vld1q_s8(filter_block + 32);
6530         filter_reg_2_a = vld1q_s8(filter_block + 64);
6531         filter_reg_0_b = vld1q_s8(filter_block + 16);
6532         filter_reg_1_b = vld1q_s8(filter_block + 16 + 32);
6533         filter_reg_2_b = vld1q_s8(filter_block + 16 + 64);
6534 
6535         const int8* scratch_data =
6536             scratch_block_data + depth_micro_stride * j_depth;
6537         typename QuantizationTypeImpl<quantization_type>::ExternalType*
6538             output_data = output_block_data + 8 * j_depth;
6539         const int8* input_data_0 = scratch_data;
6540 
6541         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
6542         bias_data += kBiasIncrement;
6543         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
6544         bias_data += kBiasIncrement;
6545 
6546         const int32x4_t output_shift_a =
6547             vld1q_s32(output_shift_per_channel + j_depth * 8);
6548         const int32x4_t output_multiplier_a =
6549             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
6550         const int32x4_t output_shift_b =
6551             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
6552         const int32x4_t output_multiplier_b =
6553             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
6554 
6555         // Load first sub-micro block of data into operational banks.
6556         int8x16_t left_bank_0_reg_a = vld1q_s8(input_data_0);
6557         int8x16_t left_bank_1_reg_a =
6558             vld1q_s8(input_data_0 + workspace_height_stride);
6559         int8x16_t left_bank_2_reg_a =
6560             vld1q_s8(input_data_0 + 2 * workspace_height_stride);
6561         int8x16_t left_bank_0_reg_b = vld1q_s8(input_data_0 + 16);
6562         int8x16_t left_bank_1_reg_b =
6563             vld1q_s8(input_data_0 + workspace_height_stride + 16);
6564         int8x16_t left_bank_2_reg_b =
6565             vld1q_s8(input_data_0 + 2 * workspace_height_stride + 16);
6566 
6567         int8x16_t right_bank_0_reg_a;
6568         int8x16_t right_bank_1_reg_a;
6569         int8x16_t right_bank_2_reg_a;
6570         int8x16_t right_bank_0_reg_b;
6571         int8x16_t right_bank_1_reg_b;
6572         int8x16_t right_bank_2_reg_b;
6573 
6574         int32x4_t acc0_a;
6575         int32x4_t acc0_b;
6576 
6577         for (int i_width = 0; i_width < output_width_overall_micro_repeats;
6578              ++i_width) {
6579           const int output_width = i_width == output_width_micro_repeats
6580                                        ? residual_width
6581                                        : kFourOverStride;
6582           TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
6583           const int8* input_data = input_data_0 + width_micro_stride * i_width;
6584           const bool no_right_block = i_width == output_width_micro_repeats &&
6585                                       output_width_overall_micro_repeats ==
6586                                           workspace_width_micro_repeats;
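          // The right-hand block is skipped only on the residual repeat and
          // only when the workspace holds no extra micro block beyond the
          // output repeats; otherwise reading one block ahead stays within
          // the scratch buffer.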
6587 
6588           if (!no_right_block) {
6589             // Load next sub-micro block of data.
6590             right_bank_0_reg_a = vld1q_s8(input_data + width_micro_stride);
6591             right_bank_1_reg_a = vld1q_s8(input_data + width_micro_stride +
6592                                           workspace_height_stride);
6593             right_bank_2_reg_a = vld1q_s8(input_data + width_micro_stride +
6594                                           2 * workspace_height_stride);
6595             right_bank_0_reg_b = vld1q_s8(input_data + width_micro_stride + 16);
6596             right_bank_1_reg_b = vld1q_s8(input_data + width_micro_stride +
6597                                           workspace_height_stride + 16);
6598             right_bank_2_reg_b = vld1q_s8(input_data + width_micro_stride +
6599                                           2 * workspace_height_stride + 16);
6600           }
6601 
6602           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6603               output_data_base = output_data + depth * 2 * i_width;
6604 
6605           // Iterate over input width shifts within 4x4 blocks.
6606           {
6607             acc0_a = adjusted_bias_data_a;
6608             acc0_b = adjusted_bias_data_b;
6609 
6610             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6611             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6612             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6613             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6614             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6615             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6616 
6617             // Fixed-point multiplication.
6618             acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6619             acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6620             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6621                 acc0_a, output_shift_a);
6622             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6623                 acc0_b, output_shift_b);
6624             // Add the output offset.
6625             int16x8_t acc_s16_0_1 =
6626                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6627             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6628             // Apply the activation function.
6629             int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6630             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6631             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6632 
6633             vst1_s8(output_data_base, acc_u8);
6634 
6635             left_bank_0_reg_a = vrev32q_u16(left_bank_0_reg_a);
6636             left_bank_1_reg_a = vrev32q_u16(left_bank_1_reg_a);
6637             left_bank_2_reg_a = vrev32q_u16(left_bank_2_reg_a);
6638             left_bank_0_reg_b = vrev32q_u16(left_bank_0_reg_b);
6639             left_bank_1_reg_b = vrev32q_u16(left_bank_1_reg_b);
6640             left_bank_2_reg_b = vrev32q_u16(left_bank_2_reg_b);
6641             vtrn1_s8x2_in_place(&left_bank_0_reg_a, &right_bank_0_reg_a);
6642             vtrn1_s8x2_in_place(&left_bank_1_reg_a, &right_bank_1_reg_a);
6643             vtrn1_s8x2_in_place(&left_bank_2_reg_a, &right_bank_2_reg_a);
6644             vtrn1_s8x2_in_place(&left_bank_0_reg_b, &right_bank_0_reg_b);
6645             vtrn1_s8x2_in_place(&left_bank_1_reg_b, &right_bank_1_reg_b);
6646             vtrn1_s8x2_in_place(&left_bank_2_reg_b, &right_bank_2_reg_b);
6647           }
6648 
6649           if (output_width > 1) {
6650             acc0_a = adjusted_bias_data_a;
6651             acc0_b = adjusted_bias_data_b;
6652 
6653             acc0_a = vdotq_s32(acc0_a, filter_reg_0_a, left_bank_0_reg_a);
6654             acc0_a = vdotq_s32(acc0_a, filter_reg_1_a, left_bank_1_reg_a);
6655             acc0_a = vdotq_s32(acc0_a, filter_reg_2_a, left_bank_2_reg_a);
6656             acc0_b = vdotq_s32(acc0_b, filter_reg_0_b, left_bank_0_reg_b);
6657             acc0_b = vdotq_s32(acc0_b, filter_reg_1_b, left_bank_1_reg_b);
6658             acc0_b = vdotq_s32(acc0_b, filter_reg_2_b, left_bank_2_reg_b);
6659 
6660             // Fixed-point multiplication.
6661             acc0_a = vqrdmulhq_s32(acc0_a, output_multiplier_a);
6662             acc0_b = vqrdmulhq_s32(acc0_b, output_multiplier_b);
6663             acc0_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6664                 acc0_a, output_shift_a);
6665             acc0_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6666                 acc0_b, output_shift_b);
6667             // Add the output offset.
6668             int16x8_t acc_s16_0_1 =
6669                 vcombine_s16(vqmovn_s32(acc0_a), vqmovn_s32(acc0_b));
6670             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6671             // Apply the activation function.
6672             int8x8_t acc_u8 = vqmovxn_s16(acc_s16_0_1);
6673             acc_u8 = util_vmax_x8(acc_u8, output_activation_min_vec);
6674             acc_u8 = util_vmin_x8(acc_u8, output_activation_max_vec);
6675 
6676             vst1_s8(output_data_base + depth, acc_u8);
6677 
6678             left_bank_0_reg_a = right_bank_0_reg_a;
6679             left_bank_1_reg_a = right_bank_1_reg_a;
6680             left_bank_2_reg_a = right_bank_2_reg_a;
6681             left_bank_0_reg_b = right_bank_0_reg_b;
6682             left_bank_1_reg_b = right_bank_1_reg_b;
6683             left_bank_2_reg_b = right_bank_2_reg_b;
6684           }
6685         }
6686       }
6687     }
6688   }  // NOLINT(readability/fn_size) Manually unrolled.
6689 
6690   static inline void Run(const int8* scratch_block_data,
6691                          const int8* filter_workspace, const int32* bias_data,
6692                          int8* output_block_data,
6693                          const DepthwiseConvDotProdParams* function_params) {
6694     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
6695                                output_block_data, function_params);
6696   }
6697 };
6698 
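// Specialization for per-channel int8 quantization with unit input depth and
// stride 1. Output multipliers and shifts are loaded per channel, four at a
// time, alongside the corresponding bias values.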
6699 template <>
6700 struct KernelMacroBlock<
6701     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
6702     QuantizationType::kPerChannelInt8,
6703     DepthwiseConvDepthMultiplication::kUnitInputDepth,
6704     /*stride=*/1> {
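  // Thin wrappers pinning the saturating narrow and min/max helpers to their
  // signed forms, presumably so the kernel body below keeps the same shape as
  // the unsigned variants of this struct.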
6705   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
6706   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
6707     return vmin_s8(a, b);
6708   }
6709   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
6710     return vmax_s8(a, b);
6711   }
6712   static inline int8x16_t util_vminq_x8(int8x16_t a, int8x16_t b) {
6713     return vminq_s8(a, b);
6714   }
6715   static inline int8x16_t util_vmaxq_x8(int8x16_t a, int8x16_t b) {
6716     return vmaxq_s8(a, b);
6717   }
6718 
6719   static inline void KernelMacroBlockIntrinsics(
6720       const int8* scratch_block_data, const int8* filter_workspace,
6721       const int32* bias_data, int8* output_block_data,
6722       const DepthwiseConvDotProdParams* function_params) {
6723     static constexpr QuantizationType quantization_type =
6724         QuantizationType::kPerChannelInt8;
6725 
6726     TFLITE_DCHECK_EQ(function_params->stride, 1);
6727     const int workspace_height_stride =
6728         function_params->workspace_height_stride;
6729     const int output_width_micro_repeats =
6730         function_params->output_width_micro_repeats;
6731     const int depth_micro_repeats = function_params->depth_micro_repeats;
6732     const int output_depth = function_params->output_depth;
6733 
6734     const int output_width_overall_micro_repeats =
6735         function_params->output_width_overall_micro_repeats;
6736     const int block_height = function_params->outbound_block_height;
6737     const int residual_width = function_params->output_residual_width;
6738     const int output_height_stride = function_params->output_height_stride;
6739     constexpr int kBiasIncrement = 4;
6740 
6741     TFLITE_DCHECK(depth_micro_repeats > 0);
6742 
6743     const int32 output_activation_min =
6744         function_params->quantized_activation_min;
6745     const int32 output_activation_max =
6746         function_params->quantized_activation_max;
6747     const int32 output_offset = function_params->output_offset;
6748     const int32* output_shift_per_channel =
6749         function_params->output_shift_per_channel;
6750     const int32* output_multiplier_per_channel =
6751         function_params->output_multiplier_per_channel;
6752     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
6753       TFLITE_DCHECK_GE(output_activation_min, 0);
6754       TFLITE_DCHECK_LT(output_activation_min, 256);
6755       TFLITE_DCHECK_GE(output_activation_max, 0);
6756       TFLITE_DCHECK_LT(output_activation_max, 256);
6757     } else {
6758       TFLITE_DCHECK_GE(output_activation_min, -128);
6759       TFLITE_DCHECK_LT(output_activation_min, 128);
6760       TFLITE_DCHECK_GE(output_activation_max, -128);
6761       TFLITE_DCHECK_LT(output_activation_max, 128);
6762       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
6763       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
6764     }
6765     TFLITE_DCHECK_GE(output_offset, -32768);
6766     TFLITE_DCHECK_LT(output_offset, 32768);
6767 
6768     const int16x8_t output_offset_vec =
6769         vdupq_n_s16(static_cast<int16>(output_offset));
6770     const int8x16_t output_activation_min_vec =
6771         vdupq_n_s8(static_cast<int8>(output_activation_min));
6772     const int8x16_t output_activation_max_vec =
6773         vdupq_n_s8(static_cast<int8>(output_activation_max));
6774 
6775     typename QuantizationTypeImpl<quantization_type>::ExternalType*
6776         output_data_depthwise = output_block_data;
6777     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
6778       // Simulate NEON-register transposition of a subset of the filter.
6779       int8x16_t filter_reg_0_a;
6780       int8x16_t filter_reg_0_b;
6781       int8x16_t filter_reg_1_a;
6782       int8x16_t filter_reg_1_b;
6783       int8x16_t filter_reg_2_a;
6784       int8x16_t filter_reg_2_b;
6785       int8x16_t filter_reg_0_a_shifted;
6786       int8x16_t filter_reg_1_a_shifted;
6787       int8x16_t filter_reg_2_a_shifted;
6788 
6789       filter_reg_0_a = vld1q_s8(filter_workspace);
6790       filter_workspace += 16;
6791       filter_reg_0_b = vld1q_s8(filter_workspace);
6792       filter_workspace += 16;
6793       filter_reg_1_a = vld1q_s8(filter_workspace);
6794       filter_workspace += 16;
6795       filter_reg_1_b = vld1q_s8(filter_workspace);
6796       filter_workspace += 16;
6797       filter_reg_2_a = vld1q_s8(filter_workspace);
6798       filter_workspace += 16;
6799       filter_reg_2_b = vld1q_s8(filter_workspace);
6800       filter_workspace += 16;
6801 
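      // The "shifted" filter copies move each 32-bit lane up by one byte, so
      // dotting them against the same input lanes effectively reads the input
      // advanced by one column; this lets the loop below emit two adjacent
      // outputs between 16-bit shifts of the input banks.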
6802       filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
6803       filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
6804       filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
6805 
6806       // When output_width_micro_repeats < output_width_overall_micro_repeats,
6807       // the trailing micro block holds residual_width (<= 4) outputs; it needs
6808       // the residual path below only when residual_width < 4, i.e. not full.
6809       const int adjusted_width_micro_repeats =
6810           (output_width_micro_repeats < output_width_overall_micro_repeats) &&
6811                   (residual_width < 4)
6812               ? output_width_micro_repeats
6813               : output_width_overall_micro_repeats;
6814 
6815       if (block_height == 4) {
6816         for (int s = 0; s < 2; ++s) {
6817           // Work through one slice, by row, at a time.
6818           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6819               output_data_base = output_data_depthwise + 4 * s;
6820 
6821           const int8* next_input_data = scratch_block_data;
6822           typename QuantizationTypeImpl<quantization_type>::ExternalType*
6823               output_data = output_data_base;
6824 
6825           const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
6826           bias_data += kBiasIncrement;
6827 
6828           const int32x4_t output_shift =
6829               vld1q_s32(output_shift_per_channel + j_depth * 8 + 4 * s);
6830           const int32x4_t output_multiplier =
6831               vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4 * s);
6832 
6833           int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
6834           int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
6835           int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
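          // Each bank packs four 4-byte input groups: even lanes hold the
          // "left" four bytes of a row and odd lanes the following "right"
          // four bytes, two rows per register as annotated above.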
6836 
6837           // Load first sub-micro block of data into operational banks.
6838           input_bank_a_reg =
6839               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
6840                                                 // uninitialized variable.
6841           input_bank_a_reg = vld1q_lane_8x4(
6842               next_input_data + workspace_height_stride, input_bank_a_reg, 2);
6843           input_bank_b_reg = vld1q_dup_s8x4(
6844               next_input_data +
6845               2 * workspace_height_stride);  // Load lane 0, avoiding
6846                                              // uninitialized variable.
6847           input_bank_b_reg =
6848               vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6849                              input_bank_b_reg, 2);
6850           input_bank_c_reg = vld1q_dup_s8x4(
6851               next_input_data +
6852               4 * workspace_height_stride);  // Load lane 0, avoiding
6853                                              // uninitialized variable.
6854           input_bank_c_reg =
6855               vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6856                              input_bank_c_reg, 2);
6857 
6858           int32x4_t acc0;
6859           int32x4_t acc1;
6860           int32x4_t acc2;
6861           int32x4_t acc3;
6862 
6863           acc0 = adjusted_bias_data;
6864           acc1 = adjusted_bias_data;
6865           acc2 = adjusted_bias_data;
6866           acc3 = adjusted_bias_data;
6867 
6868           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
6869           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
6870           acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
6871           acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
6872 
6873           int i_width = 0;
6874           for (; i_width < adjusted_width_micro_repeats; ++i_width) {
6875             next_input_data += 4;
6876 
6877             // Iterate over input width shifts within 4x4 blocks.
6878             {
6879               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
6880                                          0);
6881               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
6882                                          2);
6883               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
6884                                          2);
6885               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
6886                                          2);
6887               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
6888                                          2);
6889               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
6890                                          0);
6891               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
6892                                          0);
6893               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
6894                                          2);
6895 
6896               // Fixed-point multiplication.
6897               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6898               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6899                   acc0, output_shift);
6900               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6901               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6902                   acc1, output_shift);
6903               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6904               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6905                   acc2, output_shift);
6906               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6907               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6908                   acc3, output_shift);
6909               // Add the output offset.
6910               int16x8_t acc_s16_0_1 =
6911                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6912               int16x8_t acc_s16_2_3 =
6913                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6914               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6915               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
6916               // Apply the activation function.
6917               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
6918                                                  vqmovxn_s16(acc_s16_2_3));
6919               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
6920               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
6921 
6922               vst1q_lane_8x4(output_data, acc_u8_all, 0);
6923               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
6924               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
6925                              2);
6926               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
6927                              3);
6928 
6929               output_data += output_depth;
6930             }
6931             // Load next sub-micro block of data.
6932             input_bank_a_reg =
6933                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
6934             input_bank_a_reg = vld1q_lane_8x4(
6935                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
6936             input_bank_b_reg =
6937                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
6938                                input_bank_b_reg, 1);
6939             input_bank_b_reg =
6940                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
6941                                input_bank_b_reg, 3);
6942             input_bank_c_reg =
6943                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
6944                                input_bank_c_reg, 1);
6945             input_bank_c_reg =
6946                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
6947                                input_bank_c_reg, 3);
6948 
6949             {
6950               acc0 = adjusted_bias_data;
6951               acc1 = adjusted_bias_data;
6952               acc2 = adjusted_bias_data;
6953               acc3 = adjusted_bias_data;
6954 
6955               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
6956                                          input_bank_a_reg, 0);
6957               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
6958                                          input_bank_a_reg, 2);
6959               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
6960                                          input_bank_b_reg, 0);
6961               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
6962                                          input_bank_a_reg, 2);
6963               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
6964                                          input_bank_b_reg, 0);
6965               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
6966                                          input_bank_b_reg, 2);
6967               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
6968                                          input_bank_b_reg, 0);
6969               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
6970                                          input_bank_b_reg, 2);
6971               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
6972                                          input_bank_c_reg, 0);
6973               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
6974                                          input_bank_b_reg, 2);
6975               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
6976                                          input_bank_c_reg, 0);
6977               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
6978                                          input_bank_c_reg, 2);
6979 
6980               // Fixed-point multiplication.
6981               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
6982               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6983                   acc0, output_shift);
6984               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
6985               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6986                   acc1, output_shift);
6987               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
6988               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6989                   acc2, output_shift);
6990               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
6991               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
6992                   acc3, output_shift);
6993               // Add the output offset.
6994               int16x8_t acc_s16_0_1 =
6995                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
6996               int16x8_t acc_s16_2_3 =
6997                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
6998               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
6999               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7000               // Apply the activation function.
7001               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7002                                                  vqmovxn_s16(acc_s16_2_3));
7003               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7004               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7005 
7006               vst1q_lane_8x4(output_data, acc_u8_all, 0);
7007               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7008               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7009                              2);
7010               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7011                              3);
7012 
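              // Shifting each 64-bit half right by 16 bits drops the two
              // input bytes already consumed, sliding the window two columns
              // to the right for the next pair of outputs.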
7013               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7014               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7015               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7016 
7017               output_data += output_depth;
7018             }
7019 
7020             {
7021               acc0 = adjusted_bias_data;
7022               acc1 = adjusted_bias_data;
7023               acc2 = adjusted_bias_data;
7024               acc3 = adjusted_bias_data;
7025 
7026               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7027                                          0);
7028               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7029                                          2);
7030               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7031                                          0);
7032               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7033                                          2);
7034               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7035                                          0);
7036               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7037                                          2);
7038               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7039                                          0);
7040               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7041                                          2);
7042               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7043                                          0);
7044               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7045                                          2);
7046               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7047                                          0);
7048               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7049                                          2);
7050 
7051               // Fixed-point multiplication.
7052               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7053               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7054                   acc0, output_shift);
7055               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7056               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7057                   acc1, output_shift);
7058               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7059               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7060                   acc2, output_shift);
7061               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7062               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7063                   acc3, output_shift);
7064               // Add the output offset.
7065               int16x8_t acc_s16_0_1 =
7066                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7067               int16x8_t acc_s16_2_3 =
7068                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7069               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7070               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7071               // Apply the activation function.
7072               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7073                                                  vqmovxn_s16(acc_s16_2_3));
7074               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7075               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7076 
7077               vst1q_lane_8x4(output_data, acc_u8_all, 0);
7078               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7079               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7080                              2);
7081               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7082                              3);
7083 
7084               output_data += output_depth;
7085             }
7086 
7087             {
7088               acc0 = adjusted_bias_data;
7089               acc1 = adjusted_bias_data;
7090               acc2 = adjusted_bias_data;
7091               acc3 = adjusted_bias_data;
7092 
7093               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
7094                                          input_bank_a_reg, 0);
7095               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
7096                                          input_bank_a_reg, 2);
7097               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
7098                                          input_bank_b_reg, 0);
7099               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
7100                                          input_bank_a_reg, 2);
7101               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
7102                                          input_bank_b_reg, 0);
7103               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
7104                                          input_bank_b_reg, 2);
7105               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
7106                                          input_bank_b_reg, 0);
7107               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
7108                                          input_bank_b_reg, 2);
7109               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
7110                                          input_bank_c_reg, 0);
7111               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
7112                                          input_bank_b_reg, 2);
7113               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
7114                                          input_bank_c_reg, 0);
7115               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
7116                                          input_bank_c_reg, 2);
7117 
7118               // Fixed-point multiplication.
7119               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7120               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7121                   acc0, output_shift);
7122               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7123               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7124                   acc1, output_shift);
7125               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7126               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7127                   acc2, output_shift);
7128               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7129               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7130                   acc3, output_shift);
7131               // Add the output offset.
7132               int16x8_t acc_s16_0_1 =
7133                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7134               int16x8_t acc_s16_2_3 =
7135                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7136               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7137               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7138               // Apply the activation function.
7139               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7140                                                  vqmovxn_s16(acc_s16_2_3));
7141               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7142               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7143 
7144               vst1q_lane_8x4(output_data, acc_u8_all, 0);
7145               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7146               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7147                              2);
7148               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7149                              3);
7150 
7151               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7152               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7153               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7154 
7155               output_data += output_depth;
7156               acc0 = adjusted_bias_data;
7157               acc1 = adjusted_bias_data;
7158               acc2 = adjusted_bias_data;
7159               acc3 = adjusted_bias_data;
7160 
7161               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7162                                          0);
7163               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7164                                          0);
7165               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7166                                          0);
7167               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7168                                          2);
7169             }
7170           }
7171 
7172           if (i_width < output_width_overall_micro_repeats) {
7173             next_input_data += 4;
7174             const int output_width = residual_width;
7175 
7176             // Load next sub-micro block of data.
7177             input_bank_a_reg =
7178                 vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
7179             input_bank_a_reg = vld1q_lane_8x4(
7180                 next_input_data + workspace_height_stride, input_bank_a_reg, 3);
7181             input_bank_b_reg =
7182                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7183                                input_bank_b_reg, 1);
7184             input_bank_b_reg =
7185                 vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
7186                                input_bank_b_reg, 3);
7187             input_bank_c_reg =
7188                 vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
7189                                input_bank_c_reg, 1);
7190             input_bank_c_reg =
7191                 vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
7192                                input_bank_c_reg, 3);
7193 
7194             // Iterate over input width shifts within 4x4 blocks.
7195             for (int x = 0; x < output_width; ++x) {
7196               acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
7197                                          0);
7198               acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
7199                                          2);
7200               acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
7201                                          2);
7202               acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
7203                                          2);
7204               acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
7205                                          2);
7206               acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
7207                                          0);
7208               acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
7209                                          0);
7210               acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
7211                                          2);
7212 
7213               // Fixed-point multiplication.
7214               acc0 = vqrdmulhq_s32(acc0, output_multiplier);
7215               acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7216                   acc0, output_shift);
7217               acc1 = vqrdmulhq_s32(acc1, output_multiplier);
7218               acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7219                   acc1, output_shift);
7220               acc2 = vqrdmulhq_s32(acc2, output_multiplier);
7221               acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7222                   acc2, output_shift);
7223               acc3 = vqrdmulhq_s32(acc3, output_multiplier);
7224               acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7225                   acc3, output_shift);
7226               // Add the output offset.
7227               int16x8_t acc_s16_0_1 =
7228                   vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7229               int16x8_t acc_s16_2_3 =
7230                   vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
7231               acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7232               acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
7233               // Apply the activation function.
7234               int8x16_t acc_u8_all = vcombine_u8(vqmovxn_s16(acc_s16_0_1),
7235                                                  vqmovxn_s16(acc_s16_2_3));
7236               acc_u8_all = util_vmaxq_x8(acc_u8_all, output_activation_min_vec);
7237               acc_u8_all = util_vminq_x8(acc_u8_all, output_activation_max_vec);
7238 
7239               vst1q_lane_8x4(output_data, acc_u8_all, 0);
7240               vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
7241               vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
7242                              2);
7243               vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
7244                              3);
7245 
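              // The residual path emits one output column per iteration, so
              // the banks advance by a single byte (one input column) here.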
7246               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
7247               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
7248               input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
7249 
7250               output_data += output_depth;
7251 
7252               acc0 = adjusted_bias_data;
7253               acc1 = adjusted_bias_data;
7254               acc2 = adjusted_bias_data;
7255               acc3 = adjusted_bias_data;
7256 
7257               acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
7258                                          0);
7259               acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
7260                                          0);
7261               acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
7262                                          0);
7263               acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
7264                                          2);
7265             }
7266           }
7267           // scratch_block_data += 4 * workspace_height_stride;
7268           output_data_base += 4 * output_height_stride;
7269 
7270           // Move to next sub-block: advance to second set of filters, to new
7271           // bias.
7272           filter_reg_0_a = filter_reg_0_b;
7273           filter_reg_1_a = filter_reg_1_b;
7274           filter_reg_2_a = filter_reg_2_b;
7275           filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
7276           filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
7277           filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
7278         }
7279       } else {
7280         // Block height < 4.
7281         typename QuantizationTypeImpl<quantization_type>::ExternalType*
7282             output_data_base = output_data_depthwise;
7283 
7284         const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
7285         bias_data += kBiasIncrement;
7286         const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
7287         bias_data += kBiasIncrement;
7288 
7289         const int32x4_t output_shift_a =
7290             vld1q_s32(output_shift_per_channel + j_depth * 8);
7291         const int32x4_t output_multiplier_a =
7292             vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7293         const int32x4_t output_shift_b =
7294             vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7295         const int32x4_t output_multiplier_b =
7296             vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7297 
7298         for (int k_height = 0; k_height < block_height; ++k_height) {
7299           const int8* next_input_data =
7300               scratch_block_data + k_height * workspace_height_stride;
7301           typename QuantizationTypeImpl<quantization_type>::ExternalType*
7302               output_data = output_data_base;
7303 
7304           int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
7305           int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.
7306 
7307           // Load first sub-micro block of data into operational banks.
7308           input_bank_p_reg =
7309               vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
7310                                                 // uninitialized variable.
7311           input_bank_p_reg = vld1q_lane_8x4(
7312               next_input_data + workspace_height_stride, input_bank_p_reg, 2);
7313           input_bank_q_reg = vld1q_dup_s8x4(
7314               next_input_data +
7315               2 * workspace_height_stride);  // Load lane 0, avoiding
7316                                              // uninitialized variable.
7317 
7318           for (int i_width = 0; i_width < output_width_overall_micro_repeats;
7319                ++i_width) {
7320             next_input_data += 4;
7321             const int output_width =
7322                 i_width == output_width_micro_repeats ? residual_width : 4;
7323 
7324             // Load next sub-micro block of data.
7325             input_bank_p_reg =
7326                 vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
7327             input_bank_p_reg = vld1q_lane_8x4(
7328                 next_input_data + workspace_height_stride, input_bank_p_reg, 3);
7329             input_bank_q_reg =
7330                 vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
7331                                input_bank_q_reg, 1);
7332             // Iterate over input width shifts within 4x4 blocks.
7333             for (int x = 0; x < output_width; ++x) {
7334               int32x4_t acc_a = adjusted_bias_data_a;
7335               int32x4_t acc_b = adjusted_bias_data_b;
7336               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
7337                                           input_bank_p_reg, 0);
7338               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
7339                                           input_bank_p_reg, 2);
7340               acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
7341                                           input_bank_q_reg, 0);
7342               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
7343                                           input_bank_p_reg, 0);
7344               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
7345                                           input_bank_p_reg, 2);
7346               acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
7347                                           input_bank_q_reg, 0);
7348 
7349               // Fixed-point multiplication.
7350               acc_a = vqrdmulhq_s32(acc_a, output_multiplier_a);
7351               acc_b = vqrdmulhq_s32(acc_b, output_multiplier_b);
7352               acc_a =
7353                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7354                       acc_a, output_shift_a);
7355               acc_b =
7356                   DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7357                       acc_b, output_shift_b);
7358               // Add the output offset.
7359               int16x8_t acc_s16_0_0 =
7360                   vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
7361               acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
7362               // Apply the activation function.
7363               int8x8_t acc_u8_0_0 = vqmovxn_s16(acc_s16_0_0);
7364               acc_u8_0_0 = util_vmax_x8(acc_u8_0_0,
7365                                         vget_low_s8(output_activation_min_vec));
7366               acc_u8_0_0 = util_vmin_x8(acc_u8_0_0,
7367                                         vget_low_s8(output_activation_max_vec));
7368 
7369               vst1_s8(output_data, acc_u8_0_0);
7370 
7371               input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
7372               input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);
7373 
7374               output_data += output_depth;
7375             }
7376           }
7377           output_data_base += output_height_stride;
7378         }
7379       }
7380       output_data_depthwise += 8;
7381     }
7382   }  // NOLINT(readability/fn_size) Manually unrolled.
7383 
7384   static inline void Run(const int8* scratch_block_data,
7385                          const int8* filter_workspace, const int32* bias_data,
7386                          int8* output_block_data,
7387                          const DepthwiseConvDotProdParams* function_params) {
7388     KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
7389                                output_block_data, function_params);
7390   }
7391 };
7392 
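// Specialization for per-channel int8 quantization with unit input depth and
// stride 2. Each full micro block yields two output positions per row, so the
// input banks advance by 16 bits (two input columns) per output.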
7393 template <>
7394 struct KernelMacroBlock<
7395     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
7396     QuantizationType::kPerChannelInt8,
7397     DepthwiseConvDepthMultiplication::kUnitInputDepth,
7398     /*stride=*/2> {
7399   static inline int8x8_t vqmovxn_s16(int16x8_t x) { return vqmovn_s16(x); }
7400   static inline int8x8_t util_vmin_x8(int8x8_t a, int8x8_t b) {
7401     return vmin_s8(a, b);
7402   }
7403   static inline int8x8_t util_vmax_x8(int8x8_t a, int8x8_t b) {
7404     return vmax_s8(a, b);
7405   }
7406 
7407   static inline void KernelMacroBlockIntrinsics(
7408       const int8* scratch_block_data, const int8* filter_workspace,
7409       const int32* bias_data, int8* output_block_data,
7410       const DepthwiseConvDotProdParams* function_params) {
7411     static constexpr QuantizationType quantization_type =
7412         QuantizationType::kPerChannelInt8;
7413 
7414     const int workspace_height_stride =
7415         function_params->workspace_height_stride;
7416     const int output_width_micro_repeats =
7417         function_params->output_width_micro_repeats;
7418     const int depth_micro_repeats = function_params->depth_micro_repeats;
7419     const int output_depth = function_params->output_depth;
7420     constexpr int kStrideVal = 2;
7421     TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);
7422 
7423     const int output_width_overall_micro_repeats =
7424         function_params->output_width_overall_micro_repeats;
7425     const int block_height = function_params->outbound_block_height;
7426     const int residual_width = function_params->output_residual_width;
7427     const int output_height_stride = function_params->output_height_stride;
7428     constexpr int kBiasIncrement = 4;
7429 
7430     const int32 output_activation_min =
7431         function_params->quantized_activation_min;
7432     const int32 output_activation_max =
7433         function_params->quantized_activation_max;
7434     const int32 output_offset = function_params->output_offset;
7435     const int32* output_shift_per_channel =
7436         function_params->output_shift_per_channel;
7437     const int32* output_multiplier_per_channel =
7438         function_params->output_multiplier_per_channel;
7439     if (quantization_type == QuantizationType::kNonPerChannelUint8) {
7440       TFLITE_DCHECK_GE(output_activation_min, 0);
7441       TFLITE_DCHECK_LT(output_activation_min, 256);
7442       TFLITE_DCHECK_GE(output_activation_max, 0);
7443       TFLITE_DCHECK_LT(output_activation_max, 256);
7444     } else {
7445       TFLITE_DCHECK_GE(output_activation_min, -128);
7446       TFLITE_DCHECK_LT(output_activation_min, 128);
7447       TFLITE_DCHECK_GE(output_activation_max, -128);
7448       TFLITE_DCHECK_LT(output_activation_max, 128);
7449       TFLITE_DCHECK_NE(output_shift_per_channel, nullptr);
7450       TFLITE_DCHECK_NE(output_multiplier_per_channel, nullptr);
7451     }
7452     TFLITE_DCHECK_GE(output_offset, -32768);
7453     TFLITE_DCHECK_LT(output_offset, 32768);
7454 
7455     TFLITE_DCHECK_GE(depth_micro_repeats, 1);
7456 
7457     const int16x8_t output_offset_vec =
7458         vdupq_n_s16(static_cast<int16>(output_offset));
7459     const int8x16_t output_activation_min_vec =
7460         vdupq_n_s8(static_cast<int8>(output_activation_min));
7461     const int8x16_t output_activation_max_vec =
7462         vdupq_n_s8(static_cast<int8>(output_activation_max));
7463 
7464     for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
7465       int8x16_t filter_reg_0_a;
7466       int8x16_t filter_reg_0_b;
7467       int8x16_t filter_reg_1_a;
7468       int8x16_t filter_reg_1_b;
7469       int8x16_t filter_reg_2_a;
7470       int8x16_t filter_reg_2_b;
7471 
7472       filter_reg_0_a = vld1q_s8(filter_workspace);
7473       filter_workspace += 16;
7474       filter_reg_0_b = vld1q_s8(filter_workspace);
7475       filter_workspace += 16;
7476       filter_reg_1_a = vld1q_s8(filter_workspace);
7477       filter_workspace += 16;
7478       filter_reg_1_b = vld1q_s8(filter_workspace);
7479       filter_workspace += 16;
7480       filter_reg_2_a = vld1q_s8(filter_workspace);
7481       filter_workspace += 16;
7482       filter_reg_2_b = vld1q_s8(filter_workspace);
7483       filter_workspace += 16;
7484 
7485       const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
7486       bias_data += kBiasIncrement;
7487       const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
7488       bias_data += kBiasIncrement;
7489 
7490       const int32x4_t output_shift_s_0 =
7491           vld1q_s32(output_shift_per_channel + j_depth * 8);
7492       const int32x4_t output_multiplier_s_0 =
7493           vld1q_s32(output_multiplier_per_channel + j_depth * 8);
7494       const int32x4_t output_shift_s_1 =
7495           vld1q_s32(output_shift_per_channel + j_depth * 8 + 4);
7496       const int32x4_t output_multiplier_s_1 =
7497           vld1q_s32(output_multiplier_per_channel + j_depth * 8 + 4);
7498 
7499       if (block_height == 2) {
7500         const int8* scratch_data = scratch_block_data;
7501         typename QuantizationTypeImpl<quantization_type>::ExternalType*
7502             output_data = output_block_data + 8 * j_depth;
7503 
7504         int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
7505         int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
7506         int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
7507 
7508         // Load first sub-micro block of data into operational banks.
7509         input_bank_a_reg =
7510             vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
7511                                            // uninitialized variable.
7512         input_bank_a_reg = vld1q_lane_8x4(
7513             scratch_data + workspace_height_stride, input_bank_a_reg, 2);
7514         input_bank_b_reg = vld1q_dup_s8x4(
7515             scratch_data +
7516             2 * workspace_height_stride);  // Load lane 0, avoiding
7517                                            // uninitialized variable.
7518         input_bank_b_reg = vld1q_lane_8x4(
7519             scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
7520         input_bank_c_reg = vld1q_dup_s8x4(
7521             scratch_data +
7522             4 * workspace_height_stride);  // Load lane 0, avoiding
7523                                            // uninitialized variable.
7524 
7525         int32x4_t acc0;
7526         int32x4_t acc1;
7527 
7528         // When output_width_micro_repeats < output_width_overall_micro_repeats,
7529         // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
7530         // residual_width < 2.
7531         const int adjusted_width_micro_repeats =
7532             (output_width_micro_repeats < output_width_overall_micro_repeats) &&
7533                     (residual_width < 2)
7534                 ? output_width_micro_repeats
7535                 : output_width_overall_micro_repeats;
7536 
7537         int i_width = 0;
7538         for (; i_width < adjusted_width_micro_repeats; ++i_width) {
7539           const int8* input_data = scratch_data + 4 + 4 * i_width;
7540 
7541           // Load next sub-micro block of data.
7542           input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
7543           input_bank_a_reg = vld1q_lane_8x4(
7544               input_data + workspace_height_stride, input_bank_a_reg, 3);
7545           input_bank_b_reg = vld1q_lane_8x4(
7546               input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
7547           input_bank_b_reg = vld1q_lane_8x4(
7548               input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
7549           input_bank_c_reg = vld1q_lane_8x4(
7550               input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
7551 
7552           int16x8_t acc_s16_0_1;
7553           int8x8_t acc_u8_0_1;
7554           // Iterate over input width shifts within 4x4 blocks.
7555           {
7556             acc0 = adjusted_bias_data_s_0;
7557             acc1 = adjusted_bias_data_s_0;
7558 
7559             acc0 =
7560                 vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7561             acc0 =
7562                 vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7563             acc0 =
7564                 vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7565             acc1 =
7566                 vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7567             acc1 =
7568                 vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7569             acc1 =
7570                 vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7571 
7572             // Fixed-point multiplication.
7573             acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7574             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7575                 acc0, output_shift_s_0);
7576             acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7577             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7578                 acc1, output_shift_s_0);
7579             // Add the output offset.
7580             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7581             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7582             // Apply the activation function.
7583             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7584             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7585                                       vget_low_s8(output_activation_min_vec));
7586             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7587                                       vget_low_s8(output_activation_max_vec));
7588 
7589             vst1_lane_8x4(output_data, acc_u8_0_1, 0);
7590             vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7591 
7592             acc0 = adjusted_bias_data_s_1;
7593             acc1 = adjusted_bias_data_s_1;
7594 
7595             acc0 =
7596                 vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7597             acc0 =
7598                 vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7599             acc0 =
7600                 vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7601             acc1 =
7602                 vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7603             acc1 =
7604                 vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7605             acc1 =
7606                 vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7607 
7608             // Fixed-point multiplication.
7609             acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7610             acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7611                 acc0, output_shift_s_1);
7612             acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7613             acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7614                 acc1, output_shift_s_1);
7615             // Add the output offset.
7616             acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7617             acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7618             // Apply the activation function.
7619             acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7620             acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
7621                                       vget_low_s8(output_activation_min_vec));
7622             acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
7623                                       vget_low_s8(output_activation_max_vec));
7624 
7625             vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
7626             vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
7627                           1);
7628 
7629             input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7630             input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7631             input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7632 
7633             output_data += output_depth;
7634           }
7635 
7636           // output_width == four_over_stride.
7637           acc0 = adjusted_bias_data_s_0;
7638           acc1 = adjusted_bias_data_s_0;
7639 
7640           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
7641           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
7642           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
7643           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
7644           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
7645           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
7646 
7647           // Fixed-point multiplication.
7648           acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
7649           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7650               acc0, output_shift_s_0);
7651           acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
7652           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7653               acc1, output_shift_s_0);
7654           // Add the output offset.
7655           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7656           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7657           // Apply the activation function.
7658           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7659           acc_u8_0_1 =
7660               util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7661           acc_u8_0_1 =
7662               util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7663 
7664           vst1_lane_8x4(output_data, acc_u8_0_1, 0);
7665           vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
7666 
7667           acc0 = adjusted_bias_data_s_1;
7668           acc1 = adjusted_bias_data_s_1;
7669 
7670           acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
7671           acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
7672           acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
7673           acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
7674           acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
7675           acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
7676 
7677           // Fixed-point multiplication.
7678           acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
7679           acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7680               acc0, output_shift_s_1);
7681           acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
7682           acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
7683               acc1, output_shift_s_1);
7684           // Add the output offset.
7685           acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
7686           acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
7687           // Apply the activation function.
7688           acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
7689           acc_u8_0_1 =
7690               util_vmax_x8(acc_u8_0_1, vget_low_s8(output_activation_min_vec));
7691           acc_u8_0_1 =
7692               util_vmin_x8(acc_u8_0_1, vget_low_s8(output_activation_max_vec));
7693 
7694           vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
7695           vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);
7696 
7697           input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
7698           input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
7699           input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
7700 
7701           output_data += output_depth;
7702         }
        for (; i_width < output_width_overall_micro_repeats; ++i_width) {
          // output_width == 1.
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
          input_bank_c_reg = vld1q_lane_8x4(
              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);

          int16x8_t acc_s16_0_1;
          int8x8_t acc_u8_0_1;
          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;
            acc1 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_0);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_0);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);

            acc0 = adjusted_bias_data_s_1;
            acc1 = adjusted_bias_data_s_1;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_1);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_1);
            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
                          1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

            output_data += output_depth;
          }
        }
      } else {
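        // Single-row case: only one output row is produced here, so one pass
        // over three input rows (held in banks a and b) suffices, and both
        // depth sub-blocks can be written with a single 8-byte store.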
        TFLITE_DCHECK_EQ(block_height, 1);
        // Work through one slice, by row, at a time.
        const int8* scratch_data = scratch_block_data;
        typename QuantizationTypeImpl<quantization_type>::ExternalType*
            output_data = output_block_data + 8 * j_depth;

        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.

        // Load first sub-micro block of data into operational banks.
        input_bank_a_reg =
            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
                                           // uninitialized variable.
        input_bank_a_reg = vld1q_lane_8x4(
            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
        input_bank_b_reg = vld1q_dup_s8x4(
            scratch_data +
            2 * workspace_height_stride);  // Load lane 0, avoiding
                                           // uninitialized variable.

        int32x4_t acc0;
        int32x4_t acc1;

        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
             ++i_width) {
          const int output_width =
              i_width == output_width_micro_repeats ? residual_width : 2;

          TFLITE_DCHECK_LE(output_width, 2);
          TFLITE_DCHECK_GE(output_width, 1);
          TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
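          // The checks above appear to reflect the register budget: each load
          // below brings in 4 input bytes per row, so at the horizontal
          // stride used here at most two output positions can be produced
          // before the banks have to be refilled.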
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);

          int16x8_t acc_s16_0_1;
          int8x8_t acc_u8_0_1;

          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
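            // vcombine_s16 above packs sub-block 0 into lanes 0..3 and
            // sub-block 1 into lanes 4..7, so the single 8-byte store below
            // covers all eight depth channels at this output position.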
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_s8(output_data, acc_u8_0_1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);

            output_data += output_depth;
          }
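          // The second output column of this micro block is computed only
          // when it exists, i.e. not for a residual block of width 1.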
          if (output_width == 2) {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_s32(acc0, output_multiplier_s_0);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc0, output_shift_s_0);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_s32(acc1, output_multiplier_s_1);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::RunMult(
                acc1, output_shift_s_1);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovxn_s16(acc_s16_0_1);
            acc_u8_0_1 = util_vmax_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_min_vec));
            acc_u8_0_1 = util_vmin_x8(acc_u8_0_1,
                                      vget_low_s8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_s8(output_data, acc_u8_0_1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);

            output_data += output_depth;
          }
        }
      }
    }
  }

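  // Entry point: forwards directly to the intrinsics implementation above,
  // matching the static Run() interface that the other kernel variants in
  // this file appear to expose.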
  static inline void Run(const int8* scratch_block_data,
                         const int8* filter_workspace, const int32* bias_data,
                         int8* output_block_data,
                         const DepthwiseConvDotProdParams* function_params) {
    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
                               output_block_data, function_params);
  }
};

#undef vst1_lane_8x4
#undef vst1q_lane_8x4
#undef vld1q_lane_s8x8
#undef vld1_lane_8x4
#undef vld1q_lane_8x4
#undef vld1q_dup_s8x4

#endif  //  USE_NEON

}  // namespace depthwise_conv
}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_