• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/qu8-dwconv/unipass-neon-mul8.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 
16 
17 void xnn_qu8_dwconv_minmax_rndnu_ukernel_up32x25__neon_mul8(
18     size_t channels,
19     size_t output_width,
20     const uint8_t** input,
21     const void* weights,
22     uint8_t* output,
23     size_t input_stride,
24     size_t output_increment,
25     size_t input_offset,
26     const uint8_t* zero,
27     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29   assert(channels != 0);
30   assert(output_width != 0);
31 
32   const uint8x8_t vkernel_zero_point = vld1_dup_u8(params->rndnu_neon.kernel_zero_point);
33   const uint16x8_t vkernel_zero_point16 = vmovl_u8(vkernel_zero_point);
34   const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
35   const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
36   const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
37   const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
38   const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
39   const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
40   do {
41     const uint8_t* i0 = input[0];
42     assert(i0 != NULL);
43     if XNN_UNPREDICTABLE(i0 != zero) {
44       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
45     }
46     const uint8_t* i1 = input[1];
47     assert(i1 != NULL);
48     if XNN_UNPREDICTABLE(i1 != zero) {
49       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
50     }
51     const uint8_t* i2 = input[2];
52     assert(i2 != NULL);
53     if XNN_UNPREDICTABLE(i2 != zero) {
54       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
55     }
56     const uint8_t* i3 = input[3];
57     assert(i3 != NULL);
58     if XNN_UNPREDICTABLE(i3 != zero) {
59       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
60     }
61     const uint8_t* i4 = input[4];
62     assert(i4 != NULL);
63     if XNN_UNPREDICTABLE(i4 != zero) {
64       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
65     }
66     const uint8_t* i5 = input[5];
67     assert(i5 != NULL);
68     if XNN_UNPREDICTABLE(i5 != zero) {
69       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
70     }
71     const uint8_t* i6 = input[6];
72     assert(i6 != NULL);
73     if XNN_UNPREDICTABLE(i6 != zero) {
74       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
75     }
76     const uint8_t* i7 = input[7];
77     assert(i7 != NULL);
78     if XNN_UNPREDICTABLE(i7 != zero) {
79       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
80     }
81     const uint8_t* i8 = input[8];
82     assert(i8 != NULL);
83     if XNN_UNPREDICTABLE(i8 != zero) {
84       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
85     }
86     const uint8_t* i9 = input[9];
87     assert(i9 != NULL);
88     if XNN_UNPREDICTABLE(i9 != zero) {
89       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
90     }
91     const uint8_t* i10 = input[10];
92     assert(i10 != NULL);
93     if XNN_UNPREDICTABLE(i10 != zero) {
94       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
95     }
96     const uint8_t* i11 = input[11];
97     assert(i11 != NULL);
98     if XNN_UNPREDICTABLE(i11 != zero) {
99       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
100     }
101     const uint8_t* i12 = input[12];
102     assert(i12 != NULL);
103     if XNN_UNPREDICTABLE(i12 != zero) {
104       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
105     }
106     const uint8_t* i13 = input[13];
107     assert(i13 != NULL);
108     if XNN_UNPREDICTABLE(i13 != zero) {
109       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
110     }
111     const uint8_t* i14 = input[14];
112     assert(i14 != NULL);
113     if XNN_UNPREDICTABLE(i14 != zero) {
114       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
115     }
116     const uint8_t* i15 = input[15];
117     assert(i15 != NULL);
118     if XNN_UNPREDICTABLE(i15 != zero) {
119       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
120     }
121     const uint8_t* i16 = input[16];
122     assert(i16 != NULL);
123     if XNN_UNPREDICTABLE(i16 != zero) {
124       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
125     }
126     const uint8_t* i17 = input[17];
127     assert(i17 != NULL);
128     if XNN_UNPREDICTABLE(i17 != zero) {
129       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
130     }
131     const uint8_t* i18 = input[18];
132     assert(i18 != NULL);
133     if XNN_UNPREDICTABLE(i18 != zero) {
134       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
135     }
136     const uint8_t* i19 = input[19];
137     assert(i19 != NULL);
138     if XNN_UNPREDICTABLE(i19 != zero) {
139       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
140     }
141     const uint8_t* i20 = input[20];
142     assert(i20 != NULL);
143     if XNN_UNPREDICTABLE(i20 != zero) {
144       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
145     }
146     const uint8_t* i21 = input[21];
147     assert(i21 != NULL);
148     if XNN_UNPREDICTABLE(i21 != zero) {
149       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
150     }
151     const uint8_t* i22 = input[22];
152     assert(i22 != NULL);
153     if XNN_UNPREDICTABLE(i22 != zero) {
154       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
155     }
156     const uint8_t* i23 = input[23];
157     assert(i23 != NULL);
158     if XNN_UNPREDICTABLE(i23 != zero) {
159       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
160     }
161     const uint8_t* i24 = input[24];
162     assert(i24 != NULL);
163     if XNN_UNPREDICTABLE(i24 != zero) {
164       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
165     }
166     input = (const uint8_t**) ((uintptr_t) input + input_stride);
167 
168 
169     size_t c = channels;
170     const void* w = weights;
171     for (; c >= 32; c -= 32) {
172       int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
173       int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
174       int32x4_t vacc89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
175       int32x4_t vaccCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
176       int32x4_t vaccGHIJ = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
177       int32x4_t vaccKLMN = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
178       int32x4_t vaccOPQR = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
179       int32x4_t vaccSTUV = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
180 
181 
182       const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
183       const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
184       const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
185       const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
186       const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
187       const uint8x8_t vk0xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
188       const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
189       const uint8x8_t vk0xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
190 
191       uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567);
192       uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF);
193       uint16x8_t vprodGHIJKLMN = vmull_u8(vi0xGHIJKLMN, vk0xGHIJKLMN);
194       uint16x8_t vprodOPQRSTUV = vmull_u8(vi0xOPQRSTUV, vk0xOPQRSTUV);
195 
196       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
197       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
198       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
199       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
200       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
201       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
202       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
203       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
204       const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
205       const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
206       const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
207       const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
208       const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
209       const uint8x8_t vk1xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
210       const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
211       const uint8x8_t vk1xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
212 
213       vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567);
214       uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
215       vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF);
216       uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
217       vprodGHIJKLMN = vmull_u8(vi1xGHIJKLMN, vk1xGHIJKLMN);
218       uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
219       vprodOPQRSTUV = vmull_u8(vi1xOPQRSTUV, vk1xOPQRSTUV);
220       uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
221 
222       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
223       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
224       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
225       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
226       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
227       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
228       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
229       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
230       const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
231       const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
232       const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
233       const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
234       const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
235       const uint8x8_t vk2xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
236       const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
237       const uint8x8_t vk2xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
238 
239       vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567);
240       vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
241       vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF);
242       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
243       vprodGHIJKLMN = vmull_u8(vi2xGHIJKLMN, vk2xGHIJKLMN);
244       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
245       vprodOPQRSTUV = vmull_u8(vi2xOPQRSTUV, vk2xOPQRSTUV);
246       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
247 
248       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
249       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
250       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
251       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
252       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
253       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
254       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
255       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
256       const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
257       const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
258       const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
259       const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
260       const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
261       const uint8x8_t vk3xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
262       const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
263       const uint8x8_t vk3xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
264 
265       vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567);
266       vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
267       vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF);
268       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
269       vprodGHIJKLMN = vmull_u8(vi3xGHIJKLMN, vk3xGHIJKLMN);
270       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
271       vprodOPQRSTUV = vmull_u8(vi3xOPQRSTUV, vk3xOPQRSTUV);
272       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
273 
274       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
275       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
276       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
277       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
278       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
279       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
280       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
281       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
282       const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
283       const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
284       const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
285       const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
286       const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
287       const uint8x8_t vk4xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
288       const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
289       const uint8x8_t vk4xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
290 
291       vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567);
292       vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
293       vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF);
294       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
295       vprodGHIJKLMN = vmull_u8(vi4xGHIJKLMN, vk4xGHIJKLMN);
296       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
297       vprodOPQRSTUV = vmull_u8(vi4xOPQRSTUV, vk4xOPQRSTUV);
298       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
299 
300       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
301       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
302       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
303       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
304       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
305       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
306       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
307       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
308       const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
309       const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
310       const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
311       const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
312       const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
313       const uint8x8_t vk5xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
314       const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
315       const uint8x8_t vk5xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
316 
317       vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567);
318       vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
319       vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF);
320       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
321       vprodGHIJKLMN = vmull_u8(vi5xGHIJKLMN, vk5xGHIJKLMN);
322       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
323       vprodOPQRSTUV = vmull_u8(vi5xOPQRSTUV, vk5xOPQRSTUV);
324       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
325 
326       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
327       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
328       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
329       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
330       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
331       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
332       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
333       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
334       const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
335       const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
336       const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
337       const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
338       const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
339       const uint8x8_t vk6xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
340       const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
341       const uint8x8_t vk6xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
342 
343       vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567);
344       vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
345       vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF);
346       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
347       vprodGHIJKLMN = vmull_u8(vi6xGHIJKLMN, vk6xGHIJKLMN);
348       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
349       vprodOPQRSTUV = vmull_u8(vi6xOPQRSTUV, vk6xOPQRSTUV);
350       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
351 
352       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
353       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
354       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
355       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
356       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
357       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
358       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
359       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
360       const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8;
361       const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
362       const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8;
363       const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
364       const uint8x8_t vi7xGHIJKLMN = vld1_u8(i7); i7 += 8;
365       const uint8x8_t vk7xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
366       const uint8x8_t vi7xOPQRSTUV = vld1_u8(i7); i7 += 8;
367       const uint8x8_t vk7xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
368 
369       vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567);
370       vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567);
371       vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF);
372       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF);
373       vprodGHIJKLMN = vmull_u8(vi7xGHIJKLMN, vk7xGHIJKLMN);
374       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi7xGHIJKLMN);
375       vprodOPQRSTUV = vmull_u8(vi7xOPQRSTUV, vk7xOPQRSTUV);
376       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi7xOPQRSTUV);
377 
378       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
379       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
380       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
381       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
382       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
383       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
384       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
385       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
386       const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
387       const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
388       const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
389       const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
390       const uint8x8_t vi8xGHIJKLMN = vld1_u8(i8); i8 += 8;
391       const uint8x8_t vk8xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
392       const uint8x8_t vi8xOPQRSTUV = vld1_u8(i8); i8 += 8;
393       const uint8x8_t vk8xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
394 
395       vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
396       vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
397       vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
398       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);
399       vprodGHIJKLMN = vmull_u8(vi8xGHIJKLMN, vk8xGHIJKLMN);
400       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi8xGHIJKLMN);
401       vprodOPQRSTUV = vmull_u8(vi8xOPQRSTUV, vk8xOPQRSTUV);
402       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi8xOPQRSTUV);
403 
404       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
405       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
406       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
407       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
408       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
409       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
410       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
411       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
412       const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8;
413       const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
414       const uint8x8_t vi9x89ABCDEF = vld1_u8(i9); i9 += 8;
415       const uint8x8_t vk9x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
416       const uint8x8_t vi9xGHIJKLMN = vld1_u8(i9); i9 += 8;
417       const uint8x8_t vk9xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
418       const uint8x8_t vi9xOPQRSTUV = vld1_u8(i9); i9 += 8;
419       const uint8x8_t vk9xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
420 
421       vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567);
422       vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567);
423       vprod89ABCDEF = vmull_u8(vi9x89ABCDEF, vk9x89ABCDEF);
424       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi9x89ABCDEF);
425       vprodGHIJKLMN = vmull_u8(vi9xGHIJKLMN, vk9xGHIJKLMN);
426       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi9xGHIJKLMN);
427       vprodOPQRSTUV = vmull_u8(vi9xOPQRSTUV, vk9xOPQRSTUV);
428       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi9xOPQRSTUV);
429 
430       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
431       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
432       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
433       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
434       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
435       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
436       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
437       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
438       const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8;
439       const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
440       const uint8x8_t vi10x89ABCDEF = vld1_u8(i10); i10 += 8;
441       const uint8x8_t vk10x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
442       const uint8x8_t vi10xGHIJKLMN = vld1_u8(i10); i10 += 8;
443       const uint8x8_t vk10xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
444       const uint8x8_t vi10xOPQRSTUV = vld1_u8(i10); i10 += 8;
445       const uint8x8_t vk10xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
446 
447       vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567);
448       vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567);
449       vprod89ABCDEF = vmull_u8(vi10x89ABCDEF, vk10x89ABCDEF);
450       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi10x89ABCDEF);
451       vprodGHIJKLMN = vmull_u8(vi10xGHIJKLMN, vk10xGHIJKLMN);
452       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi10xGHIJKLMN);
453       vprodOPQRSTUV = vmull_u8(vi10xOPQRSTUV, vk10xOPQRSTUV);
454       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi10xOPQRSTUV);
455 
456       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
457       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
458       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
459       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
460       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
461       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
462       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
463       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
464       const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8;
465       const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
466       const uint8x8_t vi11x89ABCDEF = vld1_u8(i11); i11 += 8;
467       const uint8x8_t vk11x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
468       const uint8x8_t vi11xGHIJKLMN = vld1_u8(i11); i11 += 8;
469       const uint8x8_t vk11xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
470       const uint8x8_t vi11xOPQRSTUV = vld1_u8(i11); i11 += 8;
471       const uint8x8_t vk11xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
472 
473       vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567);
474       vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567);
475       vprod89ABCDEF = vmull_u8(vi11x89ABCDEF, vk11x89ABCDEF);
476       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi11x89ABCDEF);
477       vprodGHIJKLMN = vmull_u8(vi11xGHIJKLMN, vk11xGHIJKLMN);
478       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi11xGHIJKLMN);
479       vprodOPQRSTUV = vmull_u8(vi11xOPQRSTUV, vk11xOPQRSTUV);
480       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi11xOPQRSTUV);
481 
482       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
483       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
484       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
485       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
486       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
487       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
488       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
489       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
490       const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8;
491       const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
492       const uint8x8_t vi12x89ABCDEF = vld1_u8(i12); i12 += 8;
493       const uint8x8_t vk12x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
494       const uint8x8_t vi12xGHIJKLMN = vld1_u8(i12); i12 += 8;
495       const uint8x8_t vk12xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
496       const uint8x8_t vi12xOPQRSTUV = vld1_u8(i12); i12 += 8;
497       const uint8x8_t vk12xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
498 
499       vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567);
500       vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567);
501       vprod89ABCDEF = vmull_u8(vi12x89ABCDEF, vk12x89ABCDEF);
502       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi12x89ABCDEF);
503       vprodGHIJKLMN = vmull_u8(vi12xGHIJKLMN, vk12xGHIJKLMN);
504       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi12xGHIJKLMN);
505       vprodOPQRSTUV = vmull_u8(vi12xOPQRSTUV, vk12xOPQRSTUV);
506       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi12xOPQRSTUV);
507 
508       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
509       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
510       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
511       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
512       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
513       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
514       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
515       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
516       const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8;
517       const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
518       const uint8x8_t vi13x89ABCDEF = vld1_u8(i13); i13 += 8;
519       const uint8x8_t vk13x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
520       const uint8x8_t vi13xGHIJKLMN = vld1_u8(i13); i13 += 8;
521       const uint8x8_t vk13xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
522       const uint8x8_t vi13xOPQRSTUV = vld1_u8(i13); i13 += 8;
523       const uint8x8_t vk13xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
524 
525       vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567);
526       vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567);
527       vprod89ABCDEF = vmull_u8(vi13x89ABCDEF, vk13x89ABCDEF);
528       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi13x89ABCDEF);
529       vprodGHIJKLMN = vmull_u8(vi13xGHIJKLMN, vk13xGHIJKLMN);
530       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi13xGHIJKLMN);
531       vprodOPQRSTUV = vmull_u8(vi13xOPQRSTUV, vk13xOPQRSTUV);
532       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi13xOPQRSTUV);
533 
534       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
535       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
536       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
537       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
538       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
539       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
540       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
541       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
542       const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8;
543       const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
544       const uint8x8_t vi14x89ABCDEF = vld1_u8(i14); i14 += 8;
545       const uint8x8_t vk14x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
546       const uint8x8_t vi14xGHIJKLMN = vld1_u8(i14); i14 += 8;
547       const uint8x8_t vk14xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
548       const uint8x8_t vi14xOPQRSTUV = vld1_u8(i14); i14 += 8;
549       const uint8x8_t vk14xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
550 
551       vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567);
552       vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567);
553       vprod89ABCDEF = vmull_u8(vi14x89ABCDEF, vk14x89ABCDEF);
554       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi14x89ABCDEF);
555       vprodGHIJKLMN = vmull_u8(vi14xGHIJKLMN, vk14xGHIJKLMN);
556       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi14xGHIJKLMN);
557       vprodOPQRSTUV = vmull_u8(vi14xOPQRSTUV, vk14xOPQRSTUV);
558       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi14xOPQRSTUV);
559 
560       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
561       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
562       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
563       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
564       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
565       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
566       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
567       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
568       const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8;
569       const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
570       const uint8x8_t vi15x89ABCDEF = vld1_u8(i15); i15 += 8;
571       const uint8x8_t vk15x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
572       const uint8x8_t vi15xGHIJKLMN = vld1_u8(i15); i15 += 8;
573       const uint8x8_t vk15xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
574       const uint8x8_t vi15xOPQRSTUV = vld1_u8(i15); i15 += 8;
575       const uint8x8_t vk15xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
576 
577       vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567);
578       vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567);
579       vprod89ABCDEF = vmull_u8(vi15x89ABCDEF, vk15x89ABCDEF);
580       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi15x89ABCDEF);
581       vprodGHIJKLMN = vmull_u8(vi15xGHIJKLMN, vk15xGHIJKLMN);
582       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi15xGHIJKLMN);
583       vprodOPQRSTUV = vmull_u8(vi15xOPQRSTUV, vk15xOPQRSTUV);
584       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi15xOPQRSTUV);
585 
586       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
587       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
588       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
589       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
590       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
591       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
592       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
593       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
594       const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8;
595       const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
596       const uint8x8_t vi16x89ABCDEF = vld1_u8(i16); i16 += 8;
597       const uint8x8_t vk16x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
598       const uint8x8_t vi16xGHIJKLMN = vld1_u8(i16); i16 += 8;
599       const uint8x8_t vk16xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
600       const uint8x8_t vi16xOPQRSTUV = vld1_u8(i16); i16 += 8;
601       const uint8x8_t vk16xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
602 
603       vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567);
604       vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567);
605       vprod89ABCDEF = vmull_u8(vi16x89ABCDEF, vk16x89ABCDEF);
606       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi16x89ABCDEF);
607       vprodGHIJKLMN = vmull_u8(vi16xGHIJKLMN, vk16xGHIJKLMN);
608       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi16xGHIJKLMN);
609       vprodOPQRSTUV = vmull_u8(vi16xOPQRSTUV, vk16xOPQRSTUV);
610       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi16xOPQRSTUV);
611 
612       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
613       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
614       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
615       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
616       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
617       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
618       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
619       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
620       const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8;
621       const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
622       const uint8x8_t vi17x89ABCDEF = vld1_u8(i17); i17 += 8;
623       const uint8x8_t vk17x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
624       const uint8x8_t vi17xGHIJKLMN = vld1_u8(i17); i17 += 8;
625       const uint8x8_t vk17xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
626       const uint8x8_t vi17xOPQRSTUV = vld1_u8(i17); i17 += 8;
627       const uint8x8_t vk17xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
628 
629       vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567);
630       vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567);
631       vprod89ABCDEF = vmull_u8(vi17x89ABCDEF, vk17x89ABCDEF);
632       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi17x89ABCDEF);
633       vprodGHIJKLMN = vmull_u8(vi17xGHIJKLMN, vk17xGHIJKLMN);
634       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi17xGHIJKLMN);
635       vprodOPQRSTUV = vmull_u8(vi17xOPQRSTUV, vk17xOPQRSTUV);
636       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi17xOPQRSTUV);
637 
638       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
639       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
640       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
641       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
642       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
643       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
644       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
645       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
646       const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8;
647       const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
648       const uint8x8_t vi18x89ABCDEF = vld1_u8(i18); i18 += 8;
649       const uint8x8_t vk18x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
650       const uint8x8_t vi18xGHIJKLMN = vld1_u8(i18); i18 += 8;
651       const uint8x8_t vk18xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
652       const uint8x8_t vi18xOPQRSTUV = vld1_u8(i18); i18 += 8;
653       const uint8x8_t vk18xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
654 
655       vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567);
656       vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567);
657       vprod89ABCDEF = vmull_u8(vi18x89ABCDEF, vk18x89ABCDEF);
658       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi18x89ABCDEF);
659       vprodGHIJKLMN = vmull_u8(vi18xGHIJKLMN, vk18xGHIJKLMN);
660       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi18xGHIJKLMN);
661       vprodOPQRSTUV = vmull_u8(vi18xOPQRSTUV, vk18xOPQRSTUV);
662       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi18xOPQRSTUV);
663 
664       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
665       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
666       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
667       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
668       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
669       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
670       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
671       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
672       const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8;
673       const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
674       const uint8x8_t vi19x89ABCDEF = vld1_u8(i19); i19 += 8;
675       const uint8x8_t vk19x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
676       const uint8x8_t vi19xGHIJKLMN = vld1_u8(i19); i19 += 8;
677       const uint8x8_t vk19xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
678       const uint8x8_t vi19xOPQRSTUV = vld1_u8(i19); i19 += 8;
679       const uint8x8_t vk19xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
680 
681       vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567);
682       vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567);
683       vprod89ABCDEF = vmull_u8(vi19x89ABCDEF, vk19x89ABCDEF);
684       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi19x89ABCDEF);
685       vprodGHIJKLMN = vmull_u8(vi19xGHIJKLMN, vk19xGHIJKLMN);
686       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi19xGHIJKLMN);
687       vprodOPQRSTUV = vmull_u8(vi19xOPQRSTUV, vk19xOPQRSTUV);
688       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi19xOPQRSTUV);
689 
690       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
691       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
692       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
693       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
694       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
695       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
696       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
697       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
698       const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8;
699       const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
700       const uint8x8_t vi20x89ABCDEF = vld1_u8(i20); i20 += 8;
701       const uint8x8_t vk20x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
702       const uint8x8_t vi20xGHIJKLMN = vld1_u8(i20); i20 += 8;
703       const uint8x8_t vk20xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
704       const uint8x8_t vi20xOPQRSTUV = vld1_u8(i20); i20 += 8;
705       const uint8x8_t vk20xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
706 
707       vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567);
708       vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567);
709       vprod89ABCDEF = vmull_u8(vi20x89ABCDEF, vk20x89ABCDEF);
710       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi20x89ABCDEF);
711       vprodGHIJKLMN = vmull_u8(vi20xGHIJKLMN, vk20xGHIJKLMN);
712       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi20xGHIJKLMN);
713       vprodOPQRSTUV = vmull_u8(vi20xOPQRSTUV, vk20xOPQRSTUV);
714       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi20xOPQRSTUV);
715 
716       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
717       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
718       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
719       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
720       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
721       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
722       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
723       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
724       const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 += 8;
725       const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
726       const uint8x8_t vi21x89ABCDEF = vld1_u8(i21); i21 += 8;
727       const uint8x8_t vk21x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
728       const uint8x8_t vi21xGHIJKLMN = vld1_u8(i21); i21 += 8;
729       const uint8x8_t vk21xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
730       const uint8x8_t vi21xOPQRSTUV = vld1_u8(i21); i21 += 8;
731       const uint8x8_t vk21xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
732 
733       vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567);
734       vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567);
735       vprod89ABCDEF = vmull_u8(vi21x89ABCDEF, vk21x89ABCDEF);
736       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi21x89ABCDEF);
737       vprodGHIJKLMN = vmull_u8(vi21xGHIJKLMN, vk21xGHIJKLMN);
738       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi21xGHIJKLMN);
739       vprodOPQRSTUV = vmull_u8(vi21xOPQRSTUV, vk21xOPQRSTUV);
740       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi21xOPQRSTUV);
741 
742       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
743       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
744       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
745       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
746       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
747       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
748       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
749       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
750       const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8;
751       const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
752       const uint8x8_t vi22x89ABCDEF = vld1_u8(i22); i22 += 8;
753       const uint8x8_t vk22x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
754       const uint8x8_t vi22xGHIJKLMN = vld1_u8(i22); i22 += 8;
755       const uint8x8_t vk22xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
756       const uint8x8_t vi22xOPQRSTUV = vld1_u8(i22); i22 += 8;
757       const uint8x8_t vk22xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
758 
759       vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567);
760       vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567);
761       vprod89ABCDEF = vmull_u8(vi22x89ABCDEF, vk22x89ABCDEF);
762       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi22x89ABCDEF);
763       vprodGHIJKLMN = vmull_u8(vi22xGHIJKLMN, vk22xGHIJKLMN);
764       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi22xGHIJKLMN);
765       vprodOPQRSTUV = vmull_u8(vi22xOPQRSTUV, vk22xOPQRSTUV);
766       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi22xOPQRSTUV);
767 
768       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
769       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
770       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
771       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
772       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
773       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
774       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
775       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
776       const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8;
777       const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
778       const uint8x8_t vi23x89ABCDEF = vld1_u8(i23); i23 += 8;
779       const uint8x8_t vk23x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
780       const uint8x8_t vi23xGHIJKLMN = vld1_u8(i23); i23 += 8;
781       const uint8x8_t vk23xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
782       const uint8x8_t vi23xOPQRSTUV = vld1_u8(i23); i23 += 8;
783       const uint8x8_t vk23xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
784 
785       vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567);
786       vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567);
787       vprod89ABCDEF = vmull_u8(vi23x89ABCDEF, vk23x89ABCDEF);
788       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi23x89ABCDEF);
789       vprodGHIJKLMN = vmull_u8(vi23xGHIJKLMN, vk23xGHIJKLMN);
790       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi23xGHIJKLMN);
791       vprodOPQRSTUV = vmull_u8(vi23xOPQRSTUV, vk23xOPQRSTUV);
792       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi23xOPQRSTUV);
793 
794       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
795       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
796       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
797       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
798       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
799       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
800       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
801       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
802       const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8;
803       const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
804       const uint8x8_t vi24x89ABCDEF = vld1_u8(i24); i24 += 8;
805       const uint8x8_t vk24x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
806       const uint8x8_t vi24xGHIJKLMN = vld1_u8(i24); i24 += 8;
807       const uint8x8_t vk24xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
808       const uint8x8_t vi24xOPQRSTUV = vld1_u8(i24); i24 += 8;
809       const uint8x8_t vk24xOPQRSTUV = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
810 
811       vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567);
812       vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567);
813       vprod89ABCDEF = vmull_u8(vi24x89ABCDEF, vk24x89ABCDEF);
814       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi24x89ABCDEF);
815       vprodGHIJKLMN = vmull_u8(vi24xGHIJKLMN, vk24xGHIJKLMN);
816       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi24xGHIJKLMN);
817       vprodOPQRSTUV = vmull_u8(vi24xOPQRSTUV, vk24xOPQRSTUV);
818       vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi24xOPQRSTUV);
819 
820       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
821       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
822       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
823       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
824       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
825       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
826       vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vprodOPQRSTUV)));
827       vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vprodOPQRSTUV)));
828 
829       vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point16)));
830       vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point16)));
831       vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point16)));
832       vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point16)));
833       vaccGHIJ = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN), vget_low_u16(vkernel_zero_point16)));
834       vaccKLMN = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN), vget_high_u16(vkernel_zero_point16)));
835       vaccOPQR = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV), vget_low_u16(vkernel_zero_point16)));
836       vaccSTUV = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV), vget_high_u16(vkernel_zero_point16)));
837 
838       vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
839       vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);
840       vacc89AB = vshlq_s32(vacc89AB, vright_pre_shift);
841       vaccCDEF = vshlq_s32(vaccCDEF, vright_pre_shift);
842       vaccGHIJ = vshlq_s32(vaccGHIJ, vright_pre_shift);
843       vaccKLMN = vshlq_s32(vaccKLMN, vright_pre_shift);
844       vaccOPQR = vshlq_s32(vaccOPQR, vright_pre_shift);
845       vaccSTUV = vshlq_s32(vaccSTUV, vright_pre_shift);
846 
847       vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
848       vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
849       vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
850       vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
851       vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier);
852       vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier);
853       vaccOPQR = vqdmulhq_s32(vaccOPQR, vmultiplier);
854       vaccSTUV = vqdmulhq_s32(vaccSTUV, vmultiplier);
855 
856       vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
857       vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
858       vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
859       vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
860       vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_post_shift);
861       vaccKLMN = vrshlq_s32(vaccKLMN, vright_post_shift);
862       vaccOPQR = vrshlq_s32(vaccOPQR, vright_post_shift);
863       vaccSTUV = vrshlq_s32(vaccSTUV, vright_post_shift);
864 
865 #if XNN_ARCH_ARM64
866       const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
867       const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
868       const int16x8_t vaccGHIJKLMN = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN), voutput_zero_point);
869       const int16x8_t vaccOPQRSTUV = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV), voutput_zero_point);
870 
871       uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
872       uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV);
873 #else
874       const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
875       const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
876       const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point);
877       const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point);
878 
879       uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
880       uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV));
881 #endif
882 
883       vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
884       voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min);
885 
886       vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
887       voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max);
888 
889       vst1q_u8(output, vout0123456789ABCDEF); output += 16;
890       vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16;
891     }
892     if XNN_UNLIKELY(c != 0) {
893       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 32);
894       do {
895         int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
896         int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
897 
898         const int16x8_t vi0x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i0))); i0 += 8;
899         const int16x8_t vk0x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(k), vkernel_zero_point)); k += 8;
900 
901         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
902         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
903         const int16x8_t vi1x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i1))); i1 += 8;
904         const int16x8_t vk1x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 24)), vkernel_zero_point));
905 
906         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
907         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
908         const int16x8_t vi2x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i2))); i2 += 8;
909         const int16x8_t vk2x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 56)), vkernel_zero_point));
910 
911         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
912         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
913         const int16x8_t vi3x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i3))); i3 += 8;
914         const int16x8_t vk3x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 88)), vkernel_zero_point));
915 
916         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
917         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
918         const int16x8_t vi4x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i4))); i4 += 8;
919         const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 120)), vkernel_zero_point));
920 
921         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
922         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
923         const int16x8_t vi5x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i5))); i5 += 8;
924         const int16x8_t vk5x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 152)), vkernel_zero_point));
925 
926         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
927         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
928         const int16x8_t vi6x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i6))); i6 += 8;
929         const int16x8_t vk6x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 184)), vkernel_zero_point));
930 
931         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
932         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
933         const int16x8_t vi7x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i7))); i7 += 8;
934         const int16x8_t vk7x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 216)), vkernel_zero_point));
935 
936         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
937         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
938         const int16x8_t vi8x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i8))); i8 += 8;
939         const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 248)), vkernel_zero_point));
940 
941         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
942         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
943         const int16x8_t vi9x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i9))); i9 += 8;
944         const int16x8_t vk9x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 280)), vkernel_zero_point));
945 
946         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi9x01234567), vget_low_s16(vk9x01234567));
947         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi9x01234567), vget_high_s16(vk9x01234567));
948         const int16x8_t vi10x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i10))); i10 += 8;
949         const int16x8_t vk10x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 312)), vkernel_zero_point));
950 
951         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi10x01234567), vget_low_s16(vk10x01234567));
952         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi10x01234567), vget_high_s16(vk10x01234567));
953         const int16x8_t vi11x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i11))); i11 += 8;
954         const int16x8_t vk11x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 344)), vkernel_zero_point));
955 
956         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi11x01234567), vget_low_s16(vk11x01234567));
957         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi11x01234567), vget_high_s16(vk11x01234567));
958         const int16x8_t vi12x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i12))); i12 += 8;
959         const int16x8_t vk12x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 376)), vkernel_zero_point));
960 
961         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi12x01234567), vget_low_s16(vk12x01234567));
962         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi12x01234567), vget_high_s16(vk12x01234567));
963         const int16x8_t vi13x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i13))); i13 += 8;
964         const int16x8_t vk13x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 408)), vkernel_zero_point));
965 
966         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi13x01234567), vget_low_s16(vk13x01234567));
967         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi13x01234567), vget_high_s16(vk13x01234567));
968         const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8;
969         const int16x8_t vk14x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 440)), vkernel_zero_point));
970 
971         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567));
972         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567));
973         const int16x8_t vi15x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i15))); i15 += 8;
974         const int16x8_t vk15x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 472)), vkernel_zero_point));
975 
976         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi15x01234567), vget_low_s16(vk15x01234567));
977         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi15x01234567), vget_high_s16(vk15x01234567));
978         const int16x8_t vi16x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i16))); i16 += 8;
979         const int16x8_t vk16x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 504)), vkernel_zero_point));
980 
981         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi16x01234567), vget_low_s16(vk16x01234567));
982         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi16x01234567), vget_high_s16(vk16x01234567));
983         const int16x8_t vi17x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i17))); i17 += 8;
984         const int16x8_t vk17x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 536)), vkernel_zero_point));
985 
986         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi17x01234567), vget_low_s16(vk17x01234567));
987         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi17x01234567), vget_high_s16(vk17x01234567));
988         const int16x8_t vi18x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i18))); i18 += 8;
989         const int16x8_t vk18x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 568)), vkernel_zero_point));
990 
991         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi18x01234567), vget_low_s16(vk18x01234567));
992         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi18x01234567), vget_high_s16(vk18x01234567));
993         const int16x8_t vi19x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i19))); i19 += 8;
994         const int16x8_t vk19x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 600)), vkernel_zero_point));
995 
996         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi19x01234567), vget_low_s16(vk19x01234567));
997         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi19x01234567), vget_high_s16(vk19x01234567));
998         const int16x8_t vi20x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i20))); i20 += 8;
999         const int16x8_t vk20x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 632)), vkernel_zero_point));
1000 
1001         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi20x01234567), vget_low_s16(vk20x01234567));
1002         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi20x01234567), vget_high_s16(vk20x01234567));
1003         const int16x8_t vi21x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i21))); i21 += 8;
1004         const int16x8_t vk21x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 664)), vkernel_zero_point));
1005 
1006         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi21x01234567), vget_low_s16(vk21x01234567));
1007         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi21x01234567), vget_high_s16(vk21x01234567));
1008         const int16x8_t vi22x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i22))); i22 += 8;
1009         const int16x8_t vk22x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 696)), vkernel_zero_point));
1010 
1011         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi22x01234567), vget_low_s16(vk22x01234567));
1012         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi22x01234567), vget_high_s16(vk22x01234567));
1013         const int16x8_t vi23x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i23))); i23 += 8;
1014         const int16x8_t vk23x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 728)), vkernel_zero_point));
1015 
1016         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi23x01234567), vget_low_s16(vk23x01234567));
1017         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi23x01234567), vget_high_s16(vk23x01234567));
1018         const int16x8_t vi24x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i24))); i24 += 8;
1019         const int16x8_t vk24x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 760)), vkernel_zero_point));
1020 
1021         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi24x01234567), vget_low_s16(vk24x01234567));
1022         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi24x01234567), vget_high_s16(vk24x01234567));
1023 
1024         vacc0123 = vrshlq_s32(vacc0123, vright_pre_shift);
1025         vacc4567 = vrshlq_s32(vacc4567, vright_pre_shift);
1026 
1027         vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
1028         vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
1029 
1030         vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
1031         vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
1032 
1033 #if XNN_ARCH_ARM64
1034         const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
1035         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
1036 #else
1037         const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
1038         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
1039 #endif
1040 
1041         vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
1042         vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
1043 
1044         if XNN_LIKELY(c >= 8) {
1045           vst1_u8(output, vout01234567); output += 8;
1046           c -= 8;
1047         } else {
1048           if (c & 4) {
1049             vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
1050             vout01234567 = vext_u8(vout01234567, vout01234567, 4);
1051           }
1052           if (c & 2) {
1053             vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
1054             vout01234567 = vext_u8(vout01234567, vout01234567, 2);
1055           }
1056           if (c & 1) {
1057             vst1_lane_u8(output, vout01234567, 0); output += 1;
1058           }
1059           c = 0;
1060         }
1061       } while (c != 0);
1062     }
1063 
1064     output = (uint8_t*) ((uintptr_t) output + output_increment);
1065   } while (--output_width != 0);
1066 }
1067